From 5cf59d79c092c84cde294facc6c9295a1bffe6a0 Mon Sep 17 00:00:00 2001
From: raju
Date: Fri, 31 Oct 2025 18:49:21 -0700
Subject: [PATCH 1/4] uint init fix

---
 problems/p03/p03.mojo               | 4 ++--
 problems/p04/p04.mojo               | 2 +-
 problems/p04/p04_layout_tensor.mojo | 2 +-
 problems/p05/p05.mojo               | 2 +-
 problems/p05/p05_layout_tensor.mojo | 3 +--
 problems/p06/p06.mojo               | 2 +-
 problems/p07/p07_layout_tensor.mojo | 2 +-
 7 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/problems/p03/p03.mojo b/problems/p03/p03.mojo
index 474489c5..b69420d9 100644
--- a/problems/p03/p03.mojo
+++ b/problems/p03/p03.mojo
@@ -13,11 +13,11 @@ alias dtype = DType.float32
 fn add_10_guard(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     i = thread_idx.x
     # FILL ME IN (roughly 2 lines)
-
+    
 
 
 # ANCHOR_END: add_10_guard
diff --git a/problems/p04/p04.mojo b/problems/p04/p04.mojo
index 2a954400..c47d89e4 100644
--- a/problems/p04/p04.mojo
+++ b/problems/p04/p04.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_2d(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
diff --git a/problems/p04/p04_layout_tensor.mojo b/problems/p04/p04_layout_tensor.mojo
index 01e4b3f3..536312db 100644
--- a/problems/p04/p04_layout_tensor.mojo
+++ b/problems/p04/p04_layout_tensor.mojo
@@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn add_10_2d(
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=True, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
diff --git a/problems/p05/p05.mojo b/problems/p05/p05.mojo
index 37e8aa83..aeec5cb9 100644
--- a/problems/p05/p05.mojo
+++ b/problems/p05/p05.mojo
@@ -14,7 +14,7 @@ fn broadcast_add(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
     b: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
diff --git a/problems/p05/p05_layout_tensor.mojo b/problems/p05/p05_layout_tensor.mojo
index 42fee181..2548e088 100644
--- a/problems/p05/p05_layout_tensor.mojo
+++ b/problems/p05/p05_layout_tensor.mojo
@@ -21,13 +21,12 @@ fn broadcast_add[
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, a_layout],
     b: LayoutTensor[mut=False, dtype, b_layout],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
     # FILL ME IN (roughly 2 lines)
-
 
 # ANCHOR_END: broadcast_add_layout_tensor
 def main():
     with DeviceContext() as ctx:
diff --git a/problems/p06/p06.mojo b/problems/p06/p06.mojo
index c679b21a..f55e0733 100644
--- a/problems/p06/p06.mojo
+++ b/problems/p06/p06.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     i = block_dim.x * block_idx.x + thread_idx.x
     # FILL ME IN (roughly 2 lines)
diff --git a/problems/p07/p07_layout_tensor.mojo b/problems/p07/p07_layout_tensor.mojo
index 8f939fe4..0679d0d1 100644
--- a/problems/p07/p07_layout_tensor.mojo
+++ b/problems/p07/p07_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn add_10_blocks_2d[
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, a_layout],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x

From dc34ebe70383c28821e39b505ea3d433f8ad51fc Mon Sep 17 00:00:00 2001
From: raju
Date: Fri, 31 Oct 2025 18:50:58 -0700
Subject: [PATCH 2/4] changed Int to UInt

---
 problems/p07/p07.mojo                 |  2 +-
 problems/p08/p08.mojo                 |  2 +-
 problems/p08/p08_layout_tensor.mojo   |  2 +-
 problems/p10/p10.mojo                 |  4 +--
 problems/p11/p11.mojo                 |  2 +-
 problems/p11/p11_layout_tensor.mojo   |  2 +-
 problems/p12/p12.mojo                 |  2 +-
 problems/p12/p12_layout_tensor.mojo   |  2 +-
 problems/p14/p14.mojo                 |  4 +--
 problems/p15/p15.mojo                 |  2 +-
 problems/p17/op/conv1d.mojo           |  8 ++---
 problems/p18/op/softmax.mojo          |  8 ++---
 problems/p19/op/attention.mojo        | 20 +++++------
 problems/p20/op/conv1d.mojo           |  4 +--
 problems/p21/op/embedding.mojo        | 32 ++++++++---------
 problems/p22/op/layernorm_linear.mojo | 52 +++++++++++++--------------
 problems/p23/p23.mojo                 | 46 ++++++++++++------------
 problems/p24/p24.mojo                 | 20 +++++------
 problems/p25/p25.mojo                 | 12 +++----
 problems/p26/p26.mojo                 | 10 +++---
 problems/p27/p27.mojo                 | 14 ++++----
 problems/p29/p29.mojo                 |  4 +--
 problems/p30/p30.mojo                 | 10 +++---
 problems/p31/p31.mojo                 | 12 +++----
 problems/p32/p32.mojo                 |  8 ++---
 problems/p33/p33.mojo                 |  2 +-
 problems/p34/p34.mojo                 | 12 +++----
 27 files changed, 149 insertions(+), 149 deletions(-)

diff --git a/problems/p07/p07.mojo b/problems/p07/p07.mojo
index 09db5cd1..335b2c25 100644
--- a/problems/p07/p07.mojo
+++ b/problems/p07/p07.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks_2d(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
diff --git a/problems/p08/p08.mojo b/problems/p08/p08.mojo
index dd74f555..60b1043e 100644
--- a/problems/p08/p08.mojo
+++ b/problems/p08/p08.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn add_10_shared(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
diff --git a/problems/p08/p08_layout_tensor.mojo b/problems/p08/p08_layout_tensor.mojo
index a6fce741..fc817376 100644
--- a/problems/p08/p08_layout_tensor.mojo
+++ b/problems/p08/p08_layout_tensor.mojo
@@ -19,7 +19,7 @@ fn add_10_shared_layout_tensor[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=True, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using LayoutTensor with explicit address_space
     shared = LayoutTensor[
diff --git a/problems/p10/p10.mojo b/problems/p10/p10.mojo
index 1f9fdea4..205a4867 100644
--- a/problems/p10/p10.mojo
+++ b/problems/p10/p10.mojo
@@ -17,7 +17,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn shared_memory_race(
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -45,7 +45,7 @@ fn shared_memory_race(
 fn add_10_2d(
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=True, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
diff --git a/problems/p11/p11.mojo b/problems/p11/p11.mojo
index 62d3c6d1..064bde95 100644
--- a/problems/p11/p11.mojo
+++ b/problems/p11/p11.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn pooling(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
diff --git a/problems/p11/p11_layout_tensor.mojo b/problems/p11/p11_layout_tensor.mojo
index b24d2f81..d6c9e62d 100644
--- a/problems/p11/p11_layout_tensor.mojo
+++ b/problems/p11/p11_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn pooling[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=True, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using tensor builder
     shared = LayoutTensor[
diff --git a/problems/p12/p12.mojo b/problems/p12/p12.mojo
index 690230cc..b659998e 100644
--- a/problems/p12/p12.mojo
+++ b/problems/p12/p12.mojo
@@ -17,7 +17,7 @@ fn dot_product(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
     b: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     # FILL ME IN (roughly 13 lines)
     ...
diff --git a/problems/p12/p12_layout_tensor.mojo b/problems/p12/p12_layout_tensor.mojo
index e94c32ee..9ac437d1 100644
--- a/problems/p12/p12_layout_tensor.mojo
+++ b/problems/p12/p12_layout_tensor.mojo
@@ -21,7 +21,7 @@ fn dot_product[
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=True, dtype, in_layout],
     b: LayoutTensor[mut=True, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     # FILL ME IN (roughly 13 lines)
     ...
diff --git a/problems/p14/p14.mojo b/problems/p14/p14.mojo
index 4a9e72a7..0a403ac7 100644
--- a/problems/p14/p14.mojo
+++ b/problems/p14/p14.mojo
@@ -20,7 +20,7 @@ fn prefix_sum_simple[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -43,7 +43,7 @@ fn prefix_sum_local_phase[
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
diff --git a/problems/p15/p15.mojo b/problems/p15/p15.mojo
index fc243d9b..ee82fc34 100644
--- a/problems/p15/p15.mojo
+++ b/problems/p15/p15.mojo
@@ -23,7 +23,7 @@ fn axis_sum[
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
diff --git a/problems/p17/op/conv1d.mojo b/problems/p17/op/conv1d.mojo
index f0bcd5c9..2fad3a2c 100644
--- a/problems/p17/op/conv1d.mojo
+++ b/problems/p17/op/conv1d.mojo
@@ -12,8 +12,8 @@ fn conv1d_kernel[
     in_layout: Layout,
     out_layout: Layout,
     conv_layout: Layout,
-    input_size: Int,
-    conv_size: Int,
+    input_size: UInt,
+    conv_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
@@ -80,8 +80,8 @@ struct Conv1DCustomOp:
     fn execute[
         # The kind of device this will be run on: "cpu" or "gpu"
         target: StaticString,
-        input_size: Int,
-        conv_size: Int,
+        input_size: UInt,
+        conv_size: UInt,
         dtype: DType = DType.float32,
     ](
         output: OutputTensor[rank=1],
diff --git a/problems/p18/op/softmax.mojo b/problems/p18/op/softmax.mojo
index d4d5621e..8629f162 100644
--- a/problems/p18/op/softmax.mojo
+++ b/problems/p18/op/softmax.mojo
@@ -18,8 +18,8 @@ alias BLOCK_DIM_X = 1 << log2_ceil(SIZE)
 
 
 fn softmax_gpu_kernel[
-    layout: Layout,
-    input_size: Int,
+    layout: Layout, 
+    input_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, layout],
@@ -35,7 +35,7 @@
 # ANCHOR: softmax_cpu_kernel
 fn softmax_cpu_kernel[
     layout: Layout,
-    input_size: Int,
+    input_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[dtype, layout, MutableAnyOrigin],
@@ -57,7 +57,7 @@ struct SoftmaxCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,  # "cpu" or "gpu"
-        input_size: Int,
+        input_size: UInt,
         dtype: DType = DType.float32,
     ](
         output: OutputTensor[rank=1],
diff --git a/problems/p19/op/attention.mojo b/problems/p19/op/attention.mojo
index ceffc72c..0f277cb8 100644
--- a/problems/p19/op/attention.mojo
+++ b/problems/p19/op/attention.mojo
@@ -32,9 +32,9 @@ fn matmul_idiomatic_tiled[
     a_layout: Layout,
     b_layout: Layout,
     out_layout: Layout,
-    rows: Int,
-    cols: Int,
-    inner: Int,
+    rows: UInt,
+    cols: UInt,
+    inner: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
@@ -120,8 +120,8 @@ fn matmul_idiomatic_tiled[
 fn transpose_kernel[
     layout_in: Layout,  # Layout for input matrix (seq_len, d)
     layout_out: Layout,  # Layout for output matrix (d, seq_len)
-    rows: Int,
-    cols: Int,
+    rows: UInt,
+    cols: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, layout_out, MutableAnyOrigin],
@@ -137,7 +137,7 @@ fn transpose_kernel[
 # Apply softmax to attention scores taken from p16
 fn softmax_gpu_kernel[
     layout: Layout,
-    input_size: Int,
+    input_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, layout],
@@ -209,8 +209,8 @@ fn attention_cpu_kernel[
     layout_k: Layout,
     layout_v: Layout,
     layout_out: Layout,
-    seq_len: Int,
-    d: Int,
+    seq_len: UInt,
+    d: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[dtype, layout_out, MutableAnyOrigin],
@@ -259,8 +259,8 @@ struct AttentionCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,  # "cpu" or "gpu"
-        seq_len: Int,
-        d: Int,
+        seq_len: UInt,
+        d: UInt,
         dtype: DType = DType.float32,
     ](
         output: OutputTensor[rank=1],  # Output vector (d,)
diff --git a/problems/p20/op/conv1d.mojo b/problems/p20/op/conv1d.mojo
index b03d972a..d506f5ed 100644
--- a/problems/p20/op/conv1d.mojo
+++ b/problems/p20/op/conv1d.mojo
@@ -15,8 +15,8 @@ fn conv1d_kernel[
     in_layout: Layout,
     out_layout: Layout,
     conv_layout: Layout,
-    input_size: Int,
-    conv_size: Int,
+    input_size: UInt,
+    conv_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
diff --git a/problems/p21/op/embedding.mojo b/problems/p21/op/embedding.mojo
index bc650460..5db57af5 100644
--- a/problems/p21/op/embedding.mojo
+++ b/problems/p21/op/embedding.mojo
@@ -13,10 +13,10 @@ fn embedding_kernel_coalesced[
     indices_layout: Layout,
     weights_layout: Layout,
     out_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    vocab_size: Int,
-    embed_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    vocab_size: UInt,
+    embed_dim: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
@@ -57,10 +57,10 @@ fn embedding_kernel_2d[
     indices_layout: Layout,
     weights_layout: Layout,
     out_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    vocab_size: Int,
-    embed_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    vocab_size: UInt,
+    embed_dim: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
@@ -108,10 +108,10 @@ struct EmbeddingCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,
-        batch_size: Int,
-        seq_len: Int,
-        vocab_size: Int,
-        embed_dim: Int,
+        batch_size: UInt,
+        seq_len: UInt,
+        vocab_size: UInt,
+        embed_dim: UInt,
     ](
         output: OutputTensor[
             dtype = DType.float32, rank=3
@@ -194,10 +194,10 @@ struct Embedding2DCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,
-        batch_size: Int,
-        seq_len: Int,
-        vocab_size: Int,
-        embed_dim: Int,
+        batch_size: UInt,
+        seq_len: UInt,
+        vocab_size: UInt,
+        embed_dim: UInt,
     ](
         output: OutputTensor[
             dtype = DType.float32, rank=3
diff --git a/problems/p22/op/layernorm_linear.mojo b/problems/p22/op/layernorm_linear.mojo
index 3d0fedda..7a798cd1 100644
--- a/problems/p22/op/layernorm_linear.mojo
+++ b/problems/p22/op/layernorm_linear.mojo
@@ -23,9 +23,9 @@ fn matmul_idiomatic_tiled[
     a_layout: Layout,
     b_layout: Layout,
     out_layout: Layout,
-    rows: Int,
-    cols: Int,
-    inner: Int,
+    rows: UInt,
+    cols: UInt,
+    inner: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
@@ -115,9 +115,9 @@ fn layernorm_kernel[
     input_layout: Layout,
     ln_params_layout: Layout,
     output_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    hidden_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    hidden_dim: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, output_layout],
     input: LayoutTensor[mut=False, dtype, input_layout],
@@ -149,8 +149,8 @@ fn layernorm_kernel[
 fn transpose_kernel[
     layout_in: Layout,
     layout_out: Layout,
-    rows: Int,
-    cols: Int,
+    rows: UInt,
+    cols: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, layout_out, MutableAnyOrigin],
@@ -194,8 +194,8 @@ fn add_bias_kernel[
     input_layout: Layout,
     bias_layout: Layout,
     output_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
+    batch_size: UInt,
+    seq_len: UInt,
     output_dim: Int,
 ](
     output: LayoutTensor[mut=True, dtype, output_layout],
@@ -225,10 +225,10 @@ fn minimal_fused_kernel[
     weight_layout: Layout,
     bias_layout: Layout,
     output_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    hidden_dim: Int,
-    output_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    hidden_dim: UInt,
+    output_dim: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, output_layout],
     input: LayoutTensor[mut=False, dtype, input_layout],
@@ -270,10 +270,10 @@ fn minimal_fused_kernel_backward[
     grad_ln_bias_layout: Layout,
     grad_weight_layout: Layout,
     grad_bias_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    hidden_dim: Int,
-    output_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    hidden_dim: UInt,
+    output_dim: UInt,
 ](
     grad_input: LayoutTensor[mut=True, dtype, grad_input_layout],
     grad_ln_weight: LayoutTensor[mut=True, dtype, grad_ln_weight_layout],
@@ -335,10 +335,10 @@ struct LayerNormLinearCustomOp:
     fn execute[
         target: StaticString,
         algorithm: StaticString,
-        batch_size: Int,
-        seq_len: Int,
-        hidden_dim: Int,
-        output_dim: Int,
+        batch_size: UInt,
+        seq_len: UInt,
+        hidden_dim: UInt,
+        output_dim: UInt,
     ](
         output: OutputTensor[dtype = DType.float32, rank=3],
         input: InputTensor[dtype = DType.float32, rank=3],
@@ -557,10 +557,10 @@ struct LayerNormLinearBackwardCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,
-        batch_size: Int,
-        seq_len: Int,
-        hidden_dim: Int,
-        output_dim: Int,
+        batch_size: UInt,
+        seq_len: UInt,
+        hidden_dim: UInt,
+        output_dim: UInt,
     ](
         grad_input: OutputTensor[dtype = DType.float32, rank=3],
         grad_ln_weight: OutputTensor[dtype = DType.float32, rank=1],
diff --git a/problems/p23/p23.mojo b/problems/p23/p23.mojo
index 67b36029..db9f9454 100644
--- a/problems/p23/p23.mojo
+++ b/problems/p23/p23.mojo
@@ -18,7 +18,7 @@ alias SIMD_WIDTH = simd_width_of[dtype, target = get_gpu_target()]()
 
 
 fn elementwise_add[
-    layout: Layout, dtype: DType, simd_width: Int, rank: Int, size: Int
+    layout: Layout, dtype: DType, simd_width: UInt, rank: UInt, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -47,10 +47,10 @@ alias TILE_SIZE = 32
 fn tiled_elementwise_add[
     layout: Layout,
     dtype: DType,
-    simd_width: Int,
-    rank: Int,
-    size: Int,
-    tile_size: Int,
+    simd_width: UInt,
+    rank: UInt,
+    size: UInt,
+    tile_size: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -60,7 +60,7 @@ fn tiled_elementwise_add[
     @parameter
     @always_inline
     fn process_tiles[
-        simd_width: Int, rank: Int, alignment: Int = align_of[dtype]()
+        simd_width: UInt, rank: UInt, alignment: Int = align_of[dtype]()
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         print("tile_id:", tile_id)
@@ -79,13 +79,13 @@ fn tiled_elementwise_add[
 
 # ANCHOR: manual_vectorized_tiled_elementwise_add
 fn manual_vectorized_tiled_elementwise_add[
-    layout: Layout,
+    layout: Layout, 
     dtype: DType,
-    simd_width: Int,
-    num_threads_per_tile: Int,
-    rank: Int,
-    size: Int,
-    tile_size: Int,
+    simd_width: UInt,
+    num_threads_per_tile: UInt,
+    rank: UInt,
+    size: UInt,
+    tile_size: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -98,7 +98,7 @@ fn manual_vectorized_tiled_elementwise_add[
     @parameter
     @always_inline
     fn process_manual_vectorized_tiles[
-        num_threads_per_tile: Int, rank: Int, alignment: Int = align_of[dtype]()
+        num_threads_per_tile: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         print("tile_id:", tile_id)
@@ -122,11 +122,11 @@ fn manual_vectorized_tiled_elementwise_add[
 fn vectorize_within_tiles_elementwise_add[
     layout: Layout,
     dtype: DType,
-    simd_width: Int,
-    num_threads_per_tile: Int,
-    rank: Int,
-    size: Int,
-    tile_size: Int,
+    simd_width: UInt,
+    num_threads_per_tile: UInt,
+    rank: UInt,
+    size: UInt,
+    tile_size: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -137,7 +137,7 @@ fn vectorize_within_tiles_elementwise_add[
     @parameter
     @always_inline
     fn process_tile_with_vectorize[
-        num_threads_per_tile: Int, rank: Int, alignment: Int = align_of[dtype]()
+        num_threads_per_tile: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         tile_start = tile_id * tile_size
@@ -168,7 +168,7 @@ fn vectorize_within_tiles_elementwise_add[
 @parameter
 @always_inline
 fn benchmark_elementwise_parameterized[
-    test_size: Int, tile_size: Int
+    test_size: UInt, tile_size: UInt
 ](mut b: Bencher) raises:
     bench_ctx = DeviceContext()
     alias layout = Layout.row_major(test_size)
@@ -206,7 +206,7 @@
 @parameter
 @always_inline
 fn benchmark_tiled_parameterized[
-    test_size: Int, tile_size: Int
+    test_size: UInt, tile_size: UInt
 ](mut b: Bencher) raises:
     bench_ctx = DeviceContext()
     alias layout = Layout.row_major(test_size)
@@ -238,7 +238,7 @@
 @parameter
 @always_inline
 fn benchmark_manual_vectorized_parameterized[
-    test_size: Int, tile_size: Int
+    test_size: UInt, tile_size: UInt
 ](mut b: Bencher) raises:
     bench_ctx = DeviceContext()
     alias layout = Layout.row_major(test_size)
@@ -270,7 +270,7 @@
 @parameter
 @always_inline
 fn benchmark_vectorized_parameterized[
-    test_size: Int, tile_size: Int
+    test_size: UInt, tile_size: UInt
 ](mut b: Bencher) raises:
     bench_ctx = DeviceContext()
     alias layout = Layout.row_major(test_size)
diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index a0531f05..332f4eb2 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -32,7 +32,7 @@ alias out_layout = Layout.row_major(1)
 
 
 fn traditional_dot_product_p12_style[
-    in_layout: Layout, out_layout: Layout, size: Int
+    in_layout: Layout, out_layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
@@ -73,7 +73,7 @@
 
 # ANCHOR: simple_warp_kernel
 fn simple_warp_dot_product[
-    in_layout: Layout, out_layout: Layout, size: Int
+    in_layout: Layout, out_layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
@@ -91,9 +91,9 @@ fn functional_warp_dot_product[
     layout: Layout,
     out_layout: Layout,
     dtype: DType,
-    simd_width: Int,
-    rank: Int,
-    size: Int,
+    simd_width: UInt,
+    rank: UInt,
+    size: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -103,7 +103,7 @@ fn functional_warp_dot_product[
     @parameter
     @always_inline
     fn compute_dot_product[
-        simd_width: Int, rank: Int, alignment: Int = align_of[dtype]()
+        simd_width: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
     ](indices: IndexList[rank]) capturing -> None:
         idx = indices[0]
         print("idx:", idx)
@@ -117,7 +117,7 @@
 
 
 fn expected_output[
-    dtype: DType, n_warps: Int
+    dtype: DType, n_warps: UInt
 ](
     expected: HostBuffer[dtype],
     a: DeviceBuffer[dtype],
@@ -135,7 +135,7 @@
 
 
 fn rand_int[
-    dtype: DType, size: Int
+    dtype: DType, size: UInt 
 ](buff: DeviceBuffer[dtype], min: Int = 0, max: Int = 100) raises:
     with buff.map_to_host() as buff_host:
         for i in range(size):
@@ -143,7 +143,7 @@
 
 
 fn check_result[
-    dtype: DType, size: Int, print_result: Bool = False
+    dtype: DType, size: UInt, print_result: Bool = False
 ](actual: DeviceBuffer[dtype], expected: HostBuffer[dtype]) raises:
     with actual.map_to_host() as actual_host:
         if print_result:
@@ -157,7 +157,7 @@
 @parameter
 @always_inline
 fn benchmark_simple_warp_parameterized[
-    test_size: Int
+    test_size: UInt
 ](mut bencher: Bencher) raises:
     alias n_warps = test_size // WARP_SIZE
     alias in_layout = Layout.row_major(test_size)
diff --git a/problems/p25/p25.mojo b/problems/p25/p25.mojo
index 0e038521..8aaa74a1 100644
--- a/problems/p25/p25.mojo
+++ b/problems/p25/p25.mojo
@@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE)
 
 
 fn neighbor_difference[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -40,7 +40,7 @@ alias layout_2 = Layout.row_major(SIZE_2)
 
 
 fn moving_average_3[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -61,11 +61,11 @@
 
 # ANCHOR: broadcast_shuffle_coordination
 fn broadcast_shuffle_coordination[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-):
+): 
     """
     Combine broadcast() and shuffle_down() for advanced warp coordination.
     Lane 0 computes block-local scaling factor, broadcasts it to all lanes in the warp.
@@ -84,7 +84,7 @@ fn broadcast_shuffle_coordination[
 
 # ANCHOR: basic_broadcast
 fn basic_broadcast[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -106,7 +106,7 @@ fn basic_broadcast[
 
 # ANCHOR: conditional_broadcast
 fn conditional_broadcast[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
diff --git a/problems/p26/p26.mojo b/problems/p26/p26.mojo
index a529d01c..57b855a0 100644
--- a/problems/p26/p26.mojo
+++ b/problems/p26/p26.mojo
@@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE)
 
 
 fn butterfly_pair_swap[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -35,7 +35,7 @@ fn butterfly_pair_swap[
 
 # ANCHOR: butterfly_parallel_max
 fn butterfly_parallel_max[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -63,7 +63,7 @@ alias layout_2 = Layout.row_major(SIZE_2)
 
 
 fn butterfly_conditional_max[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -88,7 +88,7 @@ fn butterfly_conditional_max[
 
 # ANCHOR: warp_inclusive_prefix_sum
 fn warp_inclusive_prefix_sum[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -123,7 +123,7 @@ fn warp_inclusive_prefix_sum[
 
 # ANCHOR: warp_partition
 fn warp_partition[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
diff --git a/problems/p27/p27.mojo b/problems/p27/p27.mojo
index b21efd7e..4da14fda 100644
--- a/problems/p27/p27.mojo
+++ b/problems/p27/p27.mojo
@@ -12,12 +12,12 @@ from math import floor
 
 # ANCHOR: traditional_dot_product
 fn traditional_dot_product[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
     b: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     """Traditional dot product using shared memory + barriers + tree reduction.
     Educational but complex - shows the manual coordination needed."""
@@ -64,12 +64,12 @@ alias dtype = DType.float32
 
 
 fn block_sum_dot_product[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
     b: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     """Dot product using block.sum() - convenience function like warp.sum()!
     Replaces manual shared memory + barriers + tree reduction with one line."""
@@ -92,9 +92,9 @@ fn block_histogram_bin_extract[
     input_data: LayoutTensor[mut=False, dtype, in_layout],
     bin_output: LayoutTensor[mut=True, dtype, bin_layout],
     count_output: LayoutTensor[mut=True, DType.int32, out_layout],
-    size: Int,
-    target_bin: Int,
-    num_bins: Int,
+    size: UInt,
+    target_bin: UInt,
+    num_bins: UInt,
 ):
     """Parallel histogram using block.prefix_sum() for bin extraction.
 
diff --git a/problems/p29/p29.mojo b/problems/p29/p29.mojo
index acc89e8e..b929650b 100644
--- a/problems/p29/p29.mojo
+++ b/problems/p29/p29.mojo
@@ -34,7 +34,7 @@ fn multi_stage_image_blur_pipeline[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     """Multi-stage image blur pipeline with barrier coordination.
 
@@ -93,7 +93,7 @@ fn double_buffered_stencil_computation[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     """Double-buffered stencil computation with memory barrier coordination.
 
diff --git a/problems/p30/p30.mojo b/problems/p30/p30.mojo
index 1f708bba..d683abd3 100644
--- a/problems/p30/p30.mojo
+++ b/problems/p30/p30.mojo
@@ -19,7 +19,7 @@ fn kernel1[
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
     b: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     i = block_dim.x * block_idx.x + thread_idx.x
     if i < size:
@@ -36,7 +36,7 @@ fn kernel2[
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
     b: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     tid = block_idx.x * block_dim.x + thread_idx.x
     stride = 512
@@ -57,7 +57,7 @@ fn kernel3[
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
     b: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     tid = block_idx.x * block_dim.x + thread_idx.x
     total_threads = (SIZE // 1024) * 1024
@@ -74,7 +74,7 @@
 
 @parameter
 @always_inline
-fn benchmark_kernel1_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_kernel1_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn kernel1_workflow(ctx: DeviceContext) raises:
@@ -109,7 +109,7 @@ fn benchmark_kernel1_parameterized[test_size: Int](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_kernel2_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_kernel2_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn kernel2_workflow(ctx: DeviceContext) raises:
diff --git a/problems/p31/p31.mojo b/problems/p31/p31.mojo
index 62bed3f8..f9f29ac7 100644
--- a/problems/p31/p31.mojo
+++ b/problems/p31/p31.mojo
@@ -21,7 +21,7 @@ fn minimal_kernel[
     y: LayoutTensor[mut=True, dtype, layout],
     x: LayoutTensor[mut=False, dtype, layout],
     alpha: Float32,
-    size: Int,
+    size: UInt,
 ):
     """Minimal SAXPY kernel - simple and register-light for high occupancy."""
     i = block_dim.x * block_idx.x + thread_idx.x
@@ -41,7 +41,7 @@ fn sophisticated_kernel[
     y: LayoutTensor[mut=True, dtype, layout],
     x: LayoutTensor[mut=False, dtype, layout],
     alpha: Float32,
-    size: Int,
+    size: UInt,
 ):
     """Sophisticated SAXPY kernel - over-engineered with excessive resource usage.
     """
@@ -138,7 +138,7 @@ fn balanced_kernel[
     y: LayoutTensor[mut=True, dtype, layout],
     x: LayoutTensor[mut=False, dtype, layout],
     alpha: Float32,
-    size: Int,
+    size: UInt,
 ):
     """Balanced SAXPY kernel - efficient optimization with moderate resources.
     """
@@ -191,7 +191,7 @@
 
 @parameter
 @always_inline
-fn benchmark_minimal_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_minimal_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn minimal_workflow(ctx: DeviceContext) raises:
@@ -224,7 +224,7 @@ fn benchmark_minimal_parameterized[test_size: Int](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_sophisticated_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_sophisticated_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn sophisticated_workflow(ctx: DeviceContext) raises:
@@ -257,7 +257,7 @@ fn benchmark_sophisticated_parameterized[test_size: Int](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_balanced_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_balanced_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn balanced_workflow(ctx: DeviceContext) raises:
diff --git a/problems/p32/p32.mojo b/problems/p32/p32.mojo
index 71e35fe5..ac76c307 100644
--- a/problems/p32/p32.mojo
+++ b/problems/p32/p32.mojo
@@ -20,7 +20,7 @@ fn no_conflict_kernel[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     """Perfect shared memory access - no bank conflicts.
 
@@ -63,7 +63,7 @@ fn two_way_conflict_kernel[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     """Stride-2 shared memory access - creates 2-way bank conflicts.
 
@@ -107,7 +107,7 @@
 
 @parameter
 @always_inline
-fn benchmark_no_conflict[test_size: Int](mut b: Bencher) raises:
+fn benchmark_no_conflict[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn kernel_workflow(ctx: DeviceContext) raises:
@@ -140,7 +140,7 @@ fn benchmark_no_conflict[test_size: Int](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_two_way_conflict[test_size: Int](mut b: Bencher) raises:
+fn benchmark_two_way_conflict[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn kernel_workflow(ctx: DeviceContext) raises:
diff --git a/problems/p33/p33.mojo b/problems/p33/p33.mojo
index 4cf1b28d..ffe93f6a 100644
--- a/problems/p33/p33.mojo
+++ b/problems/p33/p33.mojo
@@ -23,7 +23,7 @@ alias THREADS_PER_BLOCK_TILED = (TILE_SIZE, TILE_SIZE)
 
 # ANCHOR: matmul_idiomatic_tiled_solution
 fn matmul_idiomatic_tiled[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
diff --git a/problems/p34/p34.mojo b/problems/p34/p34.mojo
index 6c9bf308..932cd050 100644
--- a/problems/p34/p34.mojo
+++ b/problems/p34/p34.mojo
@@ -22,11 +22,11 @@ alias out_layout = Layout.row_major(1)
 
 # ANCHOR: cluster_coordination_basics
 fn cluster_coordination_basics[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     input: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     """Real cluster coordination using SM90+ cluster APIs."""
     global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -77,12 +77,12 @@ fn cluster_coordination_basics[
 
 # ANCHOR: cluster_collective_operations
 fn cluster_collective_operations[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     input: LayoutTensor[mut=False, dtype, in_layout],
     temp_storage: LayoutTensor[mut=True, dtype, Layout.row_major(CLUSTER_SIZE)],
-    size: Int,
+    size: UInt,
 ):
     """Cluster-wide collective operations using real cluster APIs."""
     global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -96,11 +96,11 @@ fn cluster_collective_operations[
 
 # ANCHOR: advanced_cluster_patterns
 fn advanced_cluster_patterns[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     input: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     """Advanced cluster programming using cluster masks and relaxed synchronization.
     """

From daffcf90f31fc4c31da7389e667beb00c0d72f65 Mon Sep 17 00:00:00 2001
From: raju
Date: Fri, 31 Oct 2025 18:51:30 -0700
Subject: [PATCH 3/4] uint fix

---
 problems/p24/p24.mojo | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index 332f4eb2..c5584811 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -206,7 +206,7 @@ fn benchmark_simple_warp_parameterized[
 @parameter
 @always_inline
 fn benchmark_functional_warp_parameterized[
-    test_size: Int
+    test_size: UInt
 ](mut bencher: Bencher) raises:
     alias n_warps = test_size // WARP_SIZE
     alias in_layout = Layout.row_major(test_size)

From 0959976aebceb75f4be3240a71ebbbe030acbba5 Mon Sep 17 00:00:00 2001
From: raju
Date: Sat, 1 Nov 2025 10:07:14 -0700
Subject: [PATCH 4/4] fixed formatting issues

---
 problems/p03/p03.mojo               |  2 +-
 problems/p05/p05_layout_tensor.mojo |  1 +
 problems/p18/op/softmax.mojo        |  2 +-
 problems/p23/p23.mojo               | 10 +++++++---
 problems/p24/p24.mojo               |  2 +-
 problems/p25/p25.mojo               |  2 +-
 problems/p31/p31.mojo               |  4 +++-
 7 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/problems/p03/p03.mojo b/problems/p03/p03.mojo
index b69420d9..861e8a03 100644
--- a/problems/p03/p03.mojo
+++ b/problems/p03/p03.mojo
@@ -17,7 +17,7 @@ fn add_10_guard(
 ):
     i = thread_idx.x
     # FILL ME IN (roughly 2 lines)
-    
+
 
 
 # ANCHOR_END: add_10_guard
diff --git a/problems/p05/p05_layout_tensor.mojo b/problems/p05/p05_layout_tensor.mojo
index 2548e088..99c95655 100644
--- a/problems/p05/p05_layout_tensor.mojo
+++ b/problems/p05/p05_layout_tensor.mojo
@@ -27,6 +27,7 @@ fn broadcast_add[
     col = thread_idx.x
     # FILL ME IN (roughly 2 lines)
 
+
 # ANCHOR_END: broadcast_add_layout_tensor
 def main():
     with DeviceContext() as ctx:
diff --git a/problems/p18/op/softmax.mojo b/problems/p18/op/softmax.mojo
index 8629f162..429c0251 100644
--- a/problems/p18/op/softmax.mojo
+++ b/problems/p18/op/softmax.mojo
@@ -18,7 +18,7 @@ alias BLOCK_DIM_X = 1 << log2_ceil(SIZE)
 
 
 fn softmax_gpu_kernel[
-    layout: Layout, 
+    layout: Layout,
     input_size: UInt,
     dtype: DType = DType.float32,
 ](
diff --git a/problems/p23/p23.mojo b/problems/p23/p23.mojo
index db9f9454..40de6055 100644
--- a/problems/p23/p23.mojo
+++ b/problems/p23/p23.mojo
@@ -79,7 +79,7 @@ fn tiled_elementwise_add[
 
 # ANCHOR: manual_vectorized_tiled_elementwise_add
 fn manual_vectorized_tiled_elementwise_add[
-    layout: Layout, 
+    layout: Layout,
     dtype: DType,
     simd_width: UInt,
     num_threads_per_tile: UInt,
@@ -98,7 +98,9 @@ fn manual_vectorized_tiled_elementwise_add[
     @parameter
     @always_inline
     fn process_manual_vectorized_tiles[
-        num_threads_per_tile: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
+        num_threads_per_tile: UInt,
+        rank: UInt,
+        alignment: UInt = align_of[dtype](),
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         print("tile_id:", tile_id)
@@ -137,7 +139,9 @@ fn vectorize_within_tiles_elementwise_add[
     @parameter
     @always_inline
     fn process_tile_with_vectorize[
-        num_threads_per_tile: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
+        num_threads_per_tile: UInt,
+        rank: UInt,
+        alignment: UInt = align_of[dtype](),
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         tile_start = tile_id * tile_size
diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index c5584811..6ebd546e 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -135,7 +135,7 @@ fn expected_output[
 
 
 fn rand_int[
-    dtype: DType, size: UInt 
+    dtype: DType, size: UInt
 ](buff: DeviceBuffer[dtype], min: Int = 0, max: Int = 100) raises:
     with buff.map_to_host() as buff_host:
         for i in range(size):
diff --git a/problems/p25/p25.mojo b/problems/p25/p25.mojo
index 8aaa74a1..812856fd 100644
--- a/problems/p25/p25.mojo
+++ b/problems/p25/p25.mojo
@@ -65,7 +65,7 @@ fn broadcast_shuffle_coordination[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-): 
+):
     """
     Combine broadcast() and shuffle_down() for advanced warp coordination.
     Lane 0 computes block-local scaling factor, broadcasts it to all lanes in the warp.
diff --git a/problems/p31/p31.mojo b/problems/p31/p31.mojo
index f9f29ac7..ed747e64 100644
--- a/problems/p31/p31.mojo
+++ b/problems/p31/p31.mojo
@@ -224,7 +224,9 @@ fn benchmark_minimal_parameterized[test_size: UInt](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_sophisticated_parameterized[test_size: UInt](mut b: Bencher) raises:
+fn benchmark_sophisticated_parameterized[
+    test_size: UInt
+](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn sophisticated_workflow(ctx: DeviceContext) raises: