From 5cf59d79c092c84cde294facc6c9295a1bffe6a0 Mon Sep 17 00:00:00 2001
From: raju
Date: Fri, 31 Oct 2025 18:49:21 -0700
Subject: [PATCH 1/4] uint init fix

---
 problems/p03/p03.mojo               | 4 ++--
 problems/p04/p04.mojo               | 2 +-
 problems/p04/p04_layout_tensor.mojo | 2 +-
 problems/p05/p05.mojo               | 2 +-
 problems/p05/p05_layout_tensor.mojo | 3 +--
 problems/p06/p06.mojo               | 2 +-
 problems/p07/p07_layout_tensor.mojo | 2 +-
 7 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/problems/p03/p03.mojo b/problems/p03/p03.mojo
index 474489c5..b69420d9 100644
--- a/problems/p03/p03.mojo
+++ b/problems/p03/p03.mojo
@@ -13,11 +13,11 @@ alias dtype = DType.float32
 fn add_10_guard(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     i = thread_idx.x
     # FILL ME IN (roughly 2 lines)
-
+    
 
 
 # ANCHOR_END: add_10_guard
diff --git a/problems/p04/p04.mojo b/problems/p04/p04.mojo
index 2a954400..c47d89e4 100644
--- a/problems/p04/p04.mojo
+++ b/problems/p04/p04.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_2d(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
diff --git a/problems/p04/p04_layout_tensor.mojo b/problems/p04/p04_layout_tensor.mojo
index 01e4b3f3..536312db 100644
--- a/problems/p04/p04_layout_tensor.mojo
+++ b/problems/p04/p04_layout_tensor.mojo
@@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn add_10_2d(
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=True, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
diff --git a/problems/p05/p05.mojo b/problems/p05/p05.mojo
index 37e8aa83..aeec5cb9 100644
--- a/problems/p05/p05.mojo
+++ b/problems/p05/p05.mojo
@@ -14,7 +14,7 @@ fn broadcast_add(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
     b: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
diff --git a/problems/p05/p05_layout_tensor.mojo b/problems/p05/p05_layout_tensor.mojo
index 42fee181..2548e088 100644
--- a/problems/p05/p05_layout_tensor.mojo
+++ b/problems/p05/p05_layout_tensor.mojo
@@ -21,13 +21,12 @@ fn broadcast_add[
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, a_layout],
     b: LayoutTensor[mut=False, dtype, b_layout],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
     # FILL ME IN (roughly 2 lines)
-
 
 # ANCHOR_END: broadcast_add_layout_tensor
 def main():
     with DeviceContext() as ctx:
diff --git a/problems/p06/p06.mojo b/problems/p06/p06.mojo
index c679b21a..f55e0733 100644
--- a/problems/p06/p06.mojo
+++ b/problems/p06/p06.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     i = block_dim.x * block_idx.x + thread_idx.x
     # FILL ME IN (roughly 2 lines)
diff --git a/problems/p07/p07_layout_tensor.mojo b/problems/p07/p07_layout_tensor.mojo
index 8f939fe4..0679d0d1 100644
--- a/problems/p07/p07_layout_tensor.mojo
+++ b/problems/p07/p07_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn add_10_blocks_2d[
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, a_layout],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x

From dc34ebe70383c28821e39b505ea3d433f8ad51fc Mon Sep 17 00:00:00 2001
From: raju
Date: Fri, 31 Oct 2025 18:50:58 -0700
Subject: [PATCH 2/4] changed Int to UInt

---
 problems/p07/p07.mojo                 |  2 +-
 problems/p08/p08.mojo                 |  2 +-
 problems/p08/p08_layout_tensor.mojo   |  2 +-
 problems/p10/p10.mojo                 |  4 +--
 problems/p11/p11.mojo                 |  2 +-
 problems/p11/p11_layout_tensor.mojo   |  2 +-
 problems/p12/p12.mojo                 |  2 +-
 problems/p12/p12_layout_tensor.mojo   |  2 +-
 problems/p14/p14.mojo                 |  4 +--
 problems/p15/p15.mojo                 |  2 +-
 problems/p17/op/conv1d.mojo           |  8 ++---
 problems/p18/op/softmax.mojo          |  8 ++---
 problems/p19/op/attention.mojo        | 20 +++++------
 problems/p20/op/conv1d.mojo           |  4 +--
 problems/p21/op/embedding.mojo        | 32 ++++++++---------
 problems/p22/op/layernorm_linear.mojo | 52 +++++++++++++--------------
 problems/p23/p23.mojo                 | 46 ++++++++++++------------
 problems/p24/p24.mojo                 | 20 +++++------
 problems/p25/p25.mojo                 | 12 +++----
 problems/p26/p26.mojo                 | 10 +++---
 problems/p27/p27.mojo                 | 14 ++++----
 problems/p29/p29.mojo                 |  4 +--
 problems/p30/p30.mojo                 | 10 +++---
 problems/p31/p31.mojo                 | 12 +++----
 problems/p32/p32.mojo                 |  8 ++---
 problems/p33/p33.mojo                 |  2 +-
 problems/p34/p34.mojo                 | 12 +++----
 27 files changed, 149 insertions(+), 149 deletions(-)

diff --git a/problems/p07/p07.mojo b/problems/p07/p07.mojo
index 09db5cd1..335b2c25 100644
--- a/problems/p07/p07.mojo
+++ b/problems/p07/p07.mojo
@@ -13,7 +13,7 @@ alias dtype = DType.float32
 fn add_10_blocks_2d(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     row = block_dim.y * block_idx.y + thread_idx.y
     col = block_dim.x * block_idx.x + thread_idx.x
diff --git a/problems/p08/p08.mojo b/problems/p08/p08.mojo
index dd74f555..60b1043e 100644
--- a/problems/p08/p08.mojo
+++ b/problems/p08/p08.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn add_10_shared(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
diff --git a/problems/p08/p08_layout_tensor.mojo b/problems/p08/p08_layout_tensor.mojo
index a6fce741..fc817376 100644
--- a/problems/p08/p08_layout_tensor.mojo
+++ b/problems/p08/p08_layout_tensor.mojo
@@ -19,7 +19,7 @@ fn add_10_shared_layout_tensor[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=True, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using LayoutTensor with explicit address_space
     shared = LayoutTensor[
diff --git a/problems/p10/p10.mojo b/problems/p10/p10.mojo
index 1f9fdea4..205a4867 100644
--- a/problems/p10/p10.mojo
+++ b/problems/p10/p10.mojo
@@ -17,7 +17,7 @@ alias layout = Layout.row_major(SIZE, SIZE)
 fn shared_memory_race(
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
@@ -45,7 +45,7 @@ fn shared_memory_race(
 fn add_10_2d(
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=True, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     row = thread_idx.y
     col = thread_idx.x
diff --git a/problems/p11/p11.mojo b/problems/p11/p11.mojo
index 62d3c6d1..064bde95 100644
--- a/problems/p11/p11.mojo
+++ b/problems/p11/p11.mojo
@@ -16,7 +16,7 @@ alias dtype = DType.float32
 fn pooling(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     shared = stack_allocation[
         TPB,
diff --git a/problems/p11/p11_layout_tensor.mojo b/problems/p11/p11_layout_tensor.mojo
index b24d2f81..d6c9e62d 100644
--- a/problems/p11/p11_layout_tensor.mojo
+++ b/problems/p11/p11_layout_tensor.mojo
@@ -18,7 +18,7 @@ fn pooling[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=True, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     # Allocate shared memory using tensor builder
     shared = LayoutTensor[
diff --git a/problems/p12/p12.mojo b/problems/p12/p12.mojo
index 690230cc..b659998e 100644
--- a/problems/p12/p12.mojo
+++ b/problems/p12/p12.mojo
@@ -17,7 +17,7 @@ fn dot_product(
     output: UnsafePointer[Scalar[dtype]],
     a: UnsafePointer[Scalar[dtype]],
     b: UnsafePointer[Scalar[dtype]],
-    size: Int,
+    size: UInt,
 ):
     # FILL ME IN (roughly 13 lines)
     ...
diff --git a/problems/p12/p12_layout_tensor.mojo b/problems/p12/p12_layout_tensor.mojo
index e94c32ee..9ac437d1 100644
--- a/problems/p12/p12_layout_tensor.mojo
+++ b/problems/p12/p12_layout_tensor.mojo
@@ -21,7 +21,7 @@ fn dot_product[
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=True, dtype, in_layout],
     b: LayoutTensor[mut=True, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     # FILL ME IN (roughly 13 lines)
     ...
diff --git a/problems/p14/p14.mojo b/problems/p14/p14.mojo
index 4a9e72a7..0a403ac7 100644
--- a/problems/p14/p14.mojo
+++ b/problems/p14/p14.mojo
@@ -20,7 +20,7 @@ fn prefix_sum_simple[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
@@ -43,7 +43,7 @@ fn prefix_sum_local_phase[
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
diff --git a/problems/p15/p15.mojo b/problems/p15/p15.mojo
index fc243d9b..ee82fc34 100644
--- a/problems/p15/p15.mojo
+++ b/problems/p15/p15.mojo
@@ -23,7 +23,7 @@ fn axis_sum[
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     global_i = block_dim.x * block_idx.x + thread_idx.x
     local_i = thread_idx.x
diff --git a/problems/p17/op/conv1d.mojo b/problems/p17/op/conv1d.mojo
index f0bcd5c9..2fad3a2c 100644
--- a/problems/p17/op/conv1d.mojo
+++ b/problems/p17/op/conv1d.mojo
@@ -12,8 +12,8 @@ fn conv1d_kernel[
     in_layout: Layout,
     out_layout: Layout,
     conv_layout: Layout,
-    input_size: Int,
-    conv_size: Int,
+    input_size: UInt,
+    conv_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
@@ -80,8 +80,8 @@ struct Conv1DCustomOp:
     fn execute[
         # The kind of device this will be run on: "cpu" or "gpu"
         target: StaticString,
-        input_size: Int,
-        conv_size: Int,
+        input_size: UInt,
+        conv_size: UInt,
         dtype: DType = DType.float32,
     ](
         output: OutputTensor[rank=1],
diff --git a/problems/p18/op/softmax.mojo b/problems/p18/op/softmax.mojo
index d4d5621e..8629f162 100644
--- a/problems/p18/op/softmax.mojo
+++ b/problems/p18/op/softmax.mojo
@@ -18,8 +18,8 @@ alias BLOCK_DIM_X = 1 << log2_ceil(SIZE)
 
 
 fn softmax_gpu_kernel[
-    layout: Layout,
-    input_size: Int,
+    layout: Layout, 
+    input_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, layout],
@@ -35,7 +35,7 @@
 # ANCHOR: softmax_cpu_kernel
 fn softmax_cpu_kernel[
     layout: Layout,
-    input_size: Int,
+    input_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[dtype, layout, MutableAnyOrigin],
@@ -57,7 +57,7 @@ struct SoftmaxCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,  # "cpu" or "gpu"
-        input_size: Int,
+        input_size: UInt,
         dtype: DType = DType.float32,
     ](
         output: OutputTensor[rank=1],
diff --git a/problems/p19/op/attention.mojo b/problems/p19/op/attention.mojo
index ceffc72c..0f277cb8 100644
--- a/problems/p19/op/attention.mojo
+++ b/problems/p19/op/attention.mojo
@@ -32,9 +32,9 @@ fn matmul_idiomatic_tiled[
     a_layout: Layout,
     b_layout: Layout,
     out_layout: Layout,
-    rows: Int,
-    cols: Int,
-    inner: Int,
+    rows: UInt,
+    cols: UInt,
+    inner: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
@@ -120,8 +120,8 @@ fn matmul_idiomatic_tiled[
 fn transpose_kernel[
     layout_in: Layout,  # Layout for input matrix (seq_len, d)
     layout_out: Layout,  # Layout for output matrix (d, seq_len)
-    rows: Int,
-    cols: Int,
+    rows: UInt,
+    cols: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, layout_out, MutableAnyOrigin],
@@ -137,7 +137,7 @@ fn transpose_kernel[
 # Apply softmax to attention scores taken from p16
 fn softmax_gpu_kernel[
     layout: Layout,
-    input_size: Int,
+    input_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, layout],
@@ -209,8 +209,8 @@ fn attention_cpu_kernel[
     layout_k: Layout,
     layout_v: Layout,
     layout_out: Layout,
-    seq_len: Int,
-    d: Int,
+    seq_len: UInt,
+    d: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[dtype, layout_out, MutableAnyOrigin],
@@ -259,8 +259,8 @@ struct AttentionCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,  # "cpu" or "gpu"
-        seq_len: Int,
-        d: Int,
+        seq_len: UInt,
+        d: UInt,
         dtype: DType = DType.float32,
     ](
         output: OutputTensor[rank=1],  # Output vector (d,)
diff --git a/problems/p20/op/conv1d.mojo b/problems/p20/op/conv1d.mojo
index b03d972a..d506f5ed 100644
--- a/problems/p20/op/conv1d.mojo
+++ b/problems/p20/op/conv1d.mojo
@@ -15,8 +15,8 @@ fn conv1d_kernel[
     in_layout: Layout,
     out_layout: Layout,
     conv_layout: Layout,
-    input_size: Int,
-    conv_size: Int,
+    input_size: UInt,
+    conv_size: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
diff --git a/problems/p21/op/embedding.mojo b/problems/p21/op/embedding.mojo
index bc650460..5db57af5 100644
--- a/problems/p21/op/embedding.mojo
+++ b/problems/p21/op/embedding.mojo
@@ -13,10 +13,10 @@ fn embedding_kernel_coalesced[
     indices_layout: Layout,
     weights_layout: Layout,
     out_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    vocab_size: Int,
-    embed_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    vocab_size: UInt,
+    embed_dim: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
@@ -57,10 +57,10 @@ fn embedding_kernel_2d[
     indices_layout: Layout,
     weights_layout: Layout,
     out_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    vocab_size: Int,
-    embed_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    vocab_size: UInt,
+    embed_dim: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
@@ -108,10 +108,10 @@ struct EmbeddingCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,
-        batch_size: Int,
-        seq_len: Int,
-        vocab_size: Int,
-        embed_dim: Int,
+        batch_size: UInt,
+        seq_len: UInt,
+        vocab_size: UInt,
+        embed_dim: UInt,
     ](
         output: OutputTensor[
             dtype = DType.float32, rank=3
@@ -194,10 +194,10 @@ struct Embedding2DCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,
-        batch_size: Int,
-        seq_len: Int,
-        vocab_size: Int,
-        embed_dim: Int,
+        batch_size: UInt,
+        seq_len: UInt,
+        vocab_size: UInt,
+        embed_dim: UInt,
     ](
         output: OutputTensor[
             dtype = DType.float32, rank=3
diff --git a/problems/p22/op/layernorm_linear.mojo b/problems/p22/op/layernorm_linear.mojo
index 3d0fedda..7a798cd1 100644
--- a/problems/p22/op/layernorm_linear.mojo
+++ b/problems/p22/op/layernorm_linear.mojo
@@ -23,9 +23,9 @@ fn matmul_idiomatic_tiled[
     a_layout: Layout,
     b_layout: Layout,
     out_layout: Layout,
-    rows: Int,
-    cols: Int,
-    inner: Int,
+    rows: UInt,
+    cols: UInt,
+    inner: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
@@ -115,9 +115,9 @@ fn layernorm_kernel[
     input_layout: Layout,
     ln_params_layout: Layout,
     output_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    hidden_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    hidden_dim: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, output_layout],
     input: LayoutTensor[mut=False, dtype, input_layout],
@@ -149,8 +149,8 @@ fn layernorm_kernel[
 fn transpose_kernel[
     layout_in: Layout,
     layout_out: Layout,
-    rows: Int,
-    cols: Int,
+    rows: UInt,
+    cols: UInt,
     dtype: DType = DType.float32,
 ](
     output: LayoutTensor[mut=True, dtype, layout_out, MutableAnyOrigin],
@@ -194,8 +194,8 @@ fn add_bias_kernel[
     input_layout: Layout,
     bias_layout: Layout,
     output_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
+    batch_size: UInt,
+    seq_len: UInt,
     output_dim: Int,
 ](
     output: LayoutTensor[mut=True, dtype, output_layout],
@@ -225,10 +225,10 @@ fn minimal_fused_kernel[
     weight_layout: Layout,
     bias_layout: Layout,
     output_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    hidden_dim: Int,
-    output_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    hidden_dim: UInt,
+    output_dim: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, output_layout],
     input: LayoutTensor[mut=False, dtype, input_layout],
@@ -270,10 +270,10 @@ fn minimal_fused_kernel_backward[
     grad_ln_bias_layout: Layout,
     grad_weight_layout: Layout,
     grad_bias_layout: Layout,
-    batch_size: Int,
-    seq_len: Int,
-    hidden_dim: Int,
-    output_dim: Int,
+    batch_size: UInt,
+    seq_len: UInt,
+    hidden_dim: UInt,
+    output_dim: UInt,
 ](
     grad_input: LayoutTensor[mut=True, dtype, grad_input_layout],
     grad_ln_weight: LayoutTensor[mut=True, dtype, grad_ln_weight_layout],
@@ -335,10 +335,10 @@ struct LayerNormLinearCustomOp:
     fn execute[
         target: StaticString,
         algorithm: StaticString,
-        batch_size: Int,
-        seq_len: Int,
-        hidden_dim: Int,
-        output_dim: Int,
+        batch_size: UInt,
+        seq_len: UInt,
+        hidden_dim: UInt,
+        output_dim: UInt,
     ](
         output: OutputTensor[dtype = DType.float32, rank=3],
         input: InputTensor[dtype = DType.float32, rank=3],
@@ -557,10 +557,10 @@ struct LayerNormLinearBackwardCustomOp:
     @staticmethod
     fn execute[
         target: StaticString,
-        batch_size: Int,
-        seq_len: Int,
-        hidden_dim: Int,
-        output_dim: Int,
+        batch_size: UInt,
+        seq_len: UInt,
+        hidden_dim: UInt,
+        output_dim: UInt,
     ](
         grad_input: OutputTensor[dtype = DType.float32, rank=3],
         grad_ln_weight: OutputTensor[dtype = DType.float32, rank=1],
diff --git a/problems/p23/p23.mojo b/problems/p23/p23.mojo
index 67b36029..db9f9454 100644
--- a/problems/p23/p23.mojo
+++ b/problems/p23/p23.mojo
@@ -18,7 +18,7 @@ alias SIMD_WIDTH = simd_width_of[dtype, target = get_gpu_target()]()
 
 
 fn elementwise_add[
-    layout: Layout, dtype: DType, simd_width: Int, rank: Int, size: Int
+    layout: Layout, dtype: DType, simd_width: UInt, rank: UInt, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -47,10 +47,10 @@ alias TILE_SIZE = 32
 fn tiled_elementwise_add[
     layout: Layout,
     dtype: DType,
-    simd_width: Int,
-    rank: Int,
-    size: Int,
-    tile_size: Int,
+    simd_width: UInt,
+    rank: UInt,
+    size: UInt,
+    tile_size: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -60,7 +60,7 @@ fn tiled_elementwise_add[
     @parameter
     @always_inline
     fn process_tiles[
-        simd_width: Int, rank: Int, alignment: Int = align_of[dtype]()
+        simd_width: UInt, rank: UInt, alignment: Int = align_of[dtype]()
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         print("tile_id:", tile_id)
@@ -79,13 +79,13 @@ fn tiled_elementwise_add[
 
 # ANCHOR: manual_vectorized_tiled_elementwise_add
 fn manual_vectorized_tiled_elementwise_add[
-    layout: Layout,
+    layout: Layout, 
     dtype: DType,
-    simd_width: Int,
-    num_threads_per_tile: Int,
-    rank: Int,
-    size: Int,
-    tile_size: Int,
+    simd_width: UInt,
+    num_threads_per_tile: UInt,
+    rank: UInt,
+    size: UInt,
+    tile_size: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -98,7 +98,7 @@ fn manual_vectorized_tiled_elementwise_add[
     @parameter
     @always_inline
     fn process_manual_vectorized_tiles[
-        num_threads_per_tile: Int, rank: Int, alignment: Int = align_of[dtype]()
+        num_threads_per_tile: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         print("tile_id:", tile_id)
@@ -122,11 +122,11 @@ fn manual_vectorized_tiled_elementwise_add[
 fn vectorize_within_tiles_elementwise_add[
     layout: Layout,
     dtype: DType,
-    simd_width: Int,
-    num_threads_per_tile: Int,
-    rank: Int,
-    size: Int,
-    tile_size: Int,
+    simd_width: UInt,
+    num_threads_per_tile: UInt,
+    rank: UInt,
+    size: UInt,
+    tile_size: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -137,7 +137,7 @@ fn vectorize_within_tiles_elementwise_add[
     @parameter
     @always_inline
     fn process_tile_with_vectorize[
-        num_threads_per_tile: Int, rank: Int, alignment: Int = align_of[dtype]()
+        num_threads_per_tile: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         tile_start = tile_id * tile_size
@@ -168,7 +168,7 @@ fn vectorize_within_tiles_elementwise_add[
 @parameter
 @always_inline
 fn benchmark_elementwise_parameterized[
-    test_size: Int, tile_size: Int
+    test_size: UInt, tile_size: UInt
 ](mut b: Bencher) raises:
     bench_ctx = DeviceContext()
     alias layout = Layout.row_major(test_size)
@@ -206,7 +206,7 @@
 @parameter
 @always_inline
 fn benchmark_tiled_parameterized[
-    test_size: Int, tile_size: Int
+    test_size: UInt, tile_size: UInt
 ](mut b: Bencher) raises:
     bench_ctx = DeviceContext()
     alias layout = Layout.row_major(test_size)
@@ -238,7 +238,7 @@
 @parameter
 @always_inline
 fn benchmark_manual_vectorized_parameterized[
-    test_size: Int, tile_size: Int
+    test_size: UInt, tile_size: UInt
 ](mut b: Bencher) raises:
     bench_ctx = DeviceContext()
     alias layout = Layout.row_major(test_size)
@@ -270,7 +270,7 @@
 @parameter
 @always_inline
 fn benchmark_vectorized_parameterized[
-    test_size: Int, tile_size: Int
+    test_size: UInt, tile_size: UInt
 ](mut b: Bencher) raises:
     bench_ctx = DeviceContext()
     alias layout = Layout.row_major(test_size)
diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index a0531f05..332f4eb2 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -32,7 +32,7 @@ alias out_layout = Layout.row_major(1)
 
 
 fn traditional_dot_product_p12_style[
-    in_layout: Layout, out_layout: Layout, size: Int
+    in_layout: Layout, out_layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
@@ -73,7 +73,7 @@
 
 # ANCHOR: simple_warp_kernel
 fn simple_warp_dot_product[
-    in_layout: Layout, out_layout: Layout, size: Int
+    in_layout: Layout, out_layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
@@ -91,9 +91,9 @@ fn functional_warp_dot_product[
     layout: Layout,
     out_layout: Layout,
     dtype: DType,
-    simd_width: Int,
-    rank: Int,
-    size: Int,
+    simd_width: UInt,
+    rank: UInt,
+    size: UInt,
 ](
     output: LayoutTensor[mut=True, dtype, out_layout, MutableAnyOrigin],
     a: LayoutTensor[mut=False, dtype, layout, MutableAnyOrigin],
@@ -103,7 +103,7 @@ fn functional_warp_dot_product[
     @parameter
     @always_inline
     fn compute_dot_product[
-        simd_width: Int, rank: Int, alignment: Int = align_of[dtype]()
+        simd_width: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
     ](indices: IndexList[rank]) capturing -> None:
         idx = indices[0]
         print("idx:", idx)
@@ -117,7 +117,7 @@
 
 
 fn expected_output[
-    dtype: DType, n_warps: Int
+    dtype: DType, n_warps: UInt
 ](
     expected: HostBuffer[dtype],
     a: DeviceBuffer[dtype],
@@ -135,7 +135,7 @@
 
 
 fn rand_int[
-    dtype: DType, size: Int
+    dtype: DType, size: UInt 
 ](buff: DeviceBuffer[dtype], min: Int = 0, max: Int = 100) raises:
     with buff.map_to_host() as buff_host:
         for i in range(size):
@@ -143,7 +143,7 @@
 
 
 fn check_result[
-    dtype: DType, size: Int, print_result: Bool = False
+    dtype: DType, size: UInt, print_result: Bool = False
 ](actual: DeviceBuffer[dtype], expected: HostBuffer[dtype]) raises:
     with actual.map_to_host() as actual_host:
         if print_result:
@@ -157,7 +157,7 @@
 @parameter
 @always_inline
 fn benchmark_simple_warp_parameterized[
-    test_size: Int
+    test_size: UInt
 ](mut bencher: Bencher) raises:
     alias n_warps = test_size // WARP_SIZE
     alias in_layout = Layout.row_major(test_size)
diff --git a/problems/p25/p25.mojo b/problems/p25/p25.mojo
index 0e038521..8aaa74a1 100644
--- a/problems/p25/p25.mojo
+++ b/problems/p25/p25.mojo
@@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE)
 
 
 fn neighbor_difference[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -40,7 +40,7 @@ alias layout_2 = Layout.row_major(SIZE_2)
 
 
 fn moving_average_3[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -61,11 +61,11 @@
 
 # ANCHOR: broadcast_shuffle_coordination
 fn broadcast_shuffle_coordination[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-):
+): 
     """
     Combine broadcast() and shuffle_down() for advanced warp coordination.
     Lane 0 computes block-local scaling factor, broadcasts it to all lanes in the warp.
@@ -84,7 +84,7 @@ fn broadcast_shuffle_coordination[
 
 # ANCHOR: basic_broadcast
 fn basic_broadcast[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -106,7 +106,7 @@ fn basic_broadcast[
 
 # ANCHOR: conditional_broadcast
 fn conditional_broadcast[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
diff --git a/problems/p26/p26.mojo b/problems/p26/p26.mojo
index a529d01c..57b855a0 100644
--- a/problems/p26/p26.mojo
+++ b/problems/p26/p26.mojo
@@ -14,7 +14,7 @@ alias layout = Layout.row_major(SIZE)
 
 
 fn butterfly_pair_swap[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -35,7 +35,7 @@ fn butterfly_pair_swap[
 
 # ANCHOR: butterfly_parallel_max
 fn butterfly_parallel_max[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -63,7 +63,7 @@ alias layout_2 = Layout.row_major(SIZE_2)
 
 
 fn butterfly_conditional_max[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -88,7 +88,7 @@ fn butterfly_conditional_max[
 
 # ANCHOR: warp_inclusive_prefix_sum
 fn warp_inclusive_prefix_sum[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
@@ -123,7 +123,7 @@ fn warp_inclusive_prefix_sum[
 
 # ANCHOR: warp_partition
 fn warp_partition[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
diff --git a/problems/p27/p27.mojo b/problems/p27/p27.mojo
index b21efd7e..4da14fda 100644
--- a/problems/p27/p27.mojo
+++ b/problems/p27/p27.mojo
@@ -12,12 +12,12 @@ from math import floor
 
 # ANCHOR: traditional_dot_product
 fn traditional_dot_product[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
     b: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     """Traditional dot product using shared memory + barriers + tree reduction.
     Educational but complex - shows the manual coordination needed."""
@@ -64,12 +64,12 @@ alias dtype = DType.float32
 
 
 fn block_sum_dot_product[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     a: LayoutTensor[mut=False, dtype, in_layout],
     b: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     """Dot product using block.sum() - convenience function like warp.sum()!
     Replaces manual shared memory + barriers + tree reduction with one line."""
@@ -92,9 +92,9 @@ fn block_histogram_bin_extract[
     input_data: LayoutTensor[mut=False, dtype, in_layout],
     bin_output: LayoutTensor[mut=True, dtype, bin_layout],
     count_output: LayoutTensor[mut=True, DType.int32, out_layout],
-    size: Int,
-    target_bin: Int,
-    num_bins: Int,
+    size: UInt,
+    target_bin: UInt,
+    num_bins: UInt,
 ):
     """Parallel histogram using block.prefix_sum() for bin extraction.
 
diff --git a/problems/p29/p29.mojo b/problems/p29/p29.mojo
index acc89e8e..b929650b 100644
--- a/problems/p29/p29.mojo
+++ b/problems/p29/p29.mojo
@@ -34,7 +34,7 @@ fn multi_stage_image_blur_pipeline[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     """Multi-stage image blur pipeline with barrier coordination.
 
@@ -93,7 +93,7 @@ fn double_buffered_stencil_computation[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     """Double-buffered stencil computation with memory barrier coordination.
 
diff --git a/problems/p30/p30.mojo b/problems/p30/p30.mojo
index 1f708bba..d683abd3 100644
--- a/problems/p30/p30.mojo
+++ b/problems/p30/p30.mojo
@@ -19,7 +19,7 @@ fn kernel1[
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
     b: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     i = block_dim.x * block_idx.x + thread_idx.x
     if i < size:
@@ -36,7 +36,7 @@ fn kernel2[
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
     b: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     tid = block_idx.x * block_dim.x + thread_idx.x
     stride = 512
@@ -57,7 +57,7 @@ fn kernel3[
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
     b: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     tid = block_idx.x * block_dim.x + thread_idx.x
     total_threads = (SIZE // 1024) * 1024
@@ -74,7 +74,7 @@
 
 @parameter
 @always_inline
-fn benchmark_kernel1_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_kernel1_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn kernel1_workflow(ctx: DeviceContext) raises:
@@ -109,7 +109,7 @@ fn benchmark_kernel1_parameterized[test_size: Int](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_kernel2_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_kernel2_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn kernel2_workflow(ctx: DeviceContext) raises:
diff --git a/problems/p31/p31.mojo b/problems/p31/p31.mojo
index 62bed3f8..f9f29ac7 100644
--- a/problems/p31/p31.mojo
+++ b/problems/p31/p31.mojo
@@ -21,7 +21,7 @@ fn minimal_kernel[
     y: LayoutTensor[mut=True, dtype, layout],
     x: LayoutTensor[mut=False, dtype, layout],
     alpha: Float32,
-    size: Int,
+    size: UInt,
 ):
     """Minimal SAXPY kernel - simple and register-light for high occupancy."""
     i = block_dim.x * block_idx.x + thread_idx.x
@@ -41,7 +41,7 @@ fn sophisticated_kernel[
     y: LayoutTensor[mut=True, dtype, layout],
     x: LayoutTensor[mut=False, dtype, layout],
     alpha: Float32,
-    size: Int,
+    size: UInt,
 ):
     """Sophisticated SAXPY kernel - over-engineered with excessive resource usage.
     """
@@ -138,7 +138,7 @@ fn balanced_kernel[
     y: LayoutTensor[mut=True, dtype, layout],
     x: LayoutTensor[mut=False, dtype, layout],
     alpha: Float32,
-    size: Int,
+    size: UInt,
 ):
     """Balanced SAXPY kernel - efficient optimization with moderate resources.
     """
@@ -191,7 +191,7 @@
 
 @parameter
 @always_inline
-fn benchmark_minimal_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_minimal_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn minimal_workflow(ctx: DeviceContext) raises:
@@ -224,7 +224,7 @@ fn benchmark_minimal_parameterized[test_size: Int](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_sophisticated_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_sophisticated_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn sophisticated_workflow(ctx: DeviceContext) raises:
@@ -257,7 +257,7 @@ fn benchmark_sophisticated_parameterized[test_size: Int](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_balanced_parameterized[test_size: Int](mut b: Bencher) raises:
+fn benchmark_balanced_parameterized[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn balanced_workflow(ctx: DeviceContext) raises:
diff --git a/problems/p32/p32.mojo b/problems/p32/p32.mojo
index 71e35fe5..ac76c307 100644
--- a/problems/p32/p32.mojo
+++ b/problems/p32/p32.mojo
@@ -20,7 +20,7 @@ fn no_conflict_kernel[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     """Perfect shared memory access - no bank conflicts.
 
@@ -63,7 +63,7 @@ fn two_way_conflict_kernel[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-    size: Int,
+    size: UInt,
 ):
     """Stride-2 shared memory access - creates 2-way bank conflicts.
 
@@ -107,7 +107,7 @@
 
 @parameter
 @always_inline
-fn benchmark_no_conflict[test_size: Int](mut b: Bencher) raises:
+fn benchmark_no_conflict[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn kernel_workflow(ctx: DeviceContext) raises:
@@ -140,7 +140,7 @@ fn benchmark_no_conflict[test_size: Int](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_two_way_conflict[test_size: Int](mut b: Bencher) raises:
+fn benchmark_two_way_conflict[test_size: UInt](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn kernel_workflow(ctx: DeviceContext) raises:
diff --git a/problems/p33/p33.mojo b/problems/p33/p33.mojo
index 4cf1b28d..ffe93f6a 100644
--- a/problems/p33/p33.mojo
+++ b/problems/p33/p33.mojo
@@ -23,7 +23,7 @@ alias THREADS_PER_BLOCK_TILED = (TILE_SIZE, TILE_SIZE)
 
 # ANCHOR: matmul_idiomatic_tiled_solution
 fn matmul_idiomatic_tiled[
-    layout: Layout, size: Int
+    layout: Layout, size: UInt
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     a: LayoutTensor[mut=False, dtype, layout],
diff --git a/problems/p34/p34.mojo b/problems/p34/p34.mojo
index 6c9bf308..932cd050 100644
--- a/problems/p34/p34.mojo
+++ b/problems/p34/p34.mojo
@@ -22,11 +22,11 @@ alias out_layout = Layout.row_major(1)
 
 # ANCHOR: cluster_coordination_basics
 fn cluster_coordination_basics[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     input: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     """Real cluster coordination using SM90+ cluster APIs."""
     global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -77,12 +77,12 @@ fn cluster_coordination_basics[
 
 # ANCHOR: cluster_collective_operations
 fn cluster_collective_operations[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     input: LayoutTensor[mut=False, dtype, in_layout],
     temp_storage: LayoutTensor[mut=True, dtype, Layout.row_major(CLUSTER_SIZE)],
-    size: Int,
+    size: UInt,
 ):
     """Cluster-wide collective operations using real cluster APIs."""
     global_i = block_dim.x * block_idx.x + thread_idx.x
@@ -96,11 +96,11 @@ fn cluster_collective_operations[
 
 # ANCHOR: advanced_cluster_patterns
 fn advanced_cluster_patterns[
-    in_layout: Layout, out_layout: Layout, tpb: Int
+    in_layout: Layout, out_layout: Layout, tpb: UInt
 ](
     output: LayoutTensor[mut=True, dtype, out_layout],
     input: LayoutTensor[mut=False, dtype, in_layout],
-    size: Int,
+    size: UInt,
 ):
     """Advanced cluster programming using cluster masks and relaxed synchronization.
     """

From daffcf90f31fc4c31da7389e667beb00c0d72f65 Mon Sep 17 00:00:00 2001
From: raju
Date: Fri, 31 Oct 2025 18:51:30 -0700
Subject: [PATCH 3/4] uint fix

---
 problems/p24/p24.mojo | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index 332f4eb2..c5584811 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -206,7 +206,7 @@ fn benchmark_simple_warp_parameterized[
 @parameter
 @always_inline
 fn benchmark_functional_warp_parameterized[
-    test_size: Int
+    test_size: UInt
 ](mut bencher: Bencher) raises:
     alias n_warps = test_size // WARP_SIZE
     alias in_layout = Layout.row_major(test_size)

From 0959976aebceb75f4be3240a71ebbbe030acbba5 Mon Sep 17 00:00:00 2001
From: raju
Date: Sat, 1 Nov 2025 10:07:14 -0700
Subject: [PATCH 4/4] fixed formatting issues

---
 problems/p03/p03.mojo               |  2 +-
 problems/p05/p05_layout_tensor.mojo |  1 +
 problems/p18/op/softmax.mojo        |  2 +-
 problems/p23/p23.mojo               | 10 +++++++---
 problems/p24/p24.mojo               |  2 +-
 problems/p25/p25.mojo               |  2 +-
 problems/p31/p31.mojo               |  4 +++-
 7 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/problems/p03/p03.mojo b/problems/p03/p03.mojo
index b69420d9..861e8a03 100644
--- a/problems/p03/p03.mojo
+++ b/problems/p03/p03.mojo
@@ -17,7 +17,7 @@ fn add_10_guard(
 ):
     i = thread_idx.x
     # FILL ME IN (roughly 2 lines)
-    
+
 
 
 # ANCHOR_END: add_10_guard
diff --git a/problems/p05/p05_layout_tensor.mojo b/problems/p05/p05_layout_tensor.mojo
index 2548e088..99c95655 100644
--- a/problems/p05/p05_layout_tensor.mojo
+++ b/problems/p05/p05_layout_tensor.mojo
@@ -27,6 +27,7 @@ fn broadcast_add[
     col = thread_idx.x
     # FILL ME IN (roughly 2 lines)
 
+
 # ANCHOR_END: broadcast_add_layout_tensor
 def main():
     with DeviceContext() as ctx:
diff --git a/problems/p18/op/softmax.mojo b/problems/p18/op/softmax.mojo
index 8629f162..429c0251 100644
--- a/problems/p18/op/softmax.mojo
+++ b/problems/p18/op/softmax.mojo
@@ -18,7 +18,7 @@ alias BLOCK_DIM_X = 1 << log2_ceil(SIZE)
 
 
 fn softmax_gpu_kernel[
-    layout: Layout, 
+    layout: Layout,
     input_size: UInt,
     dtype: DType = DType.float32,
 ](
diff --git a/problems/p23/p23.mojo b/problems/p23/p23.mojo
index db9f9454..40de6055 100644
--- a/problems/p23/p23.mojo
+++ b/problems/p23/p23.mojo
@@ -79,7 +79,7 @@ fn tiled_elementwise_add[
 
 # ANCHOR: manual_vectorized_tiled_elementwise_add
 fn manual_vectorized_tiled_elementwise_add[
-    layout: Layout, 
+    layout: Layout,
     dtype: DType,
     simd_width: UInt,
     num_threads_per_tile: UInt,
@@ -98,7 +98,9 @@ fn manual_vectorized_tiled_elementwise_add[
     @parameter
     @always_inline
     fn process_manual_vectorized_tiles[
-        num_threads_per_tile: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
+        num_threads_per_tile: UInt,
+        rank: UInt,
+        alignment: UInt = align_of[dtype](),
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         print("tile_id:", tile_id)
@@ -137,7 +139,9 @@ fn vectorize_within_tiles_elementwise_add[
     @parameter
     @always_inline
     fn process_tile_with_vectorize[
-        num_threads_per_tile: UInt, rank: UInt, alignment: UInt = align_of[dtype]()
+        num_threads_per_tile: UInt,
+        rank: UInt,
+        alignment: UInt = align_of[dtype](),
     ](indices: IndexList[rank]) capturing -> None:
         tile_id = indices[0]
         tile_start = tile_id * tile_size
diff --git a/problems/p24/p24.mojo b/problems/p24/p24.mojo
index c5584811..6ebd546e 100644
--- a/problems/p24/p24.mojo
+++ b/problems/p24/p24.mojo
@@ -135,7 +135,7 @@ fn expected_output[
 
 
 fn rand_int[
-    dtype: DType, size: UInt 
+    dtype: DType, size: UInt
 ](buff: DeviceBuffer[dtype], min: Int = 0, max: Int = 100) raises:
     with buff.map_to_host() as buff_host:
         for i in range(size):
diff --git a/problems/p25/p25.mojo b/problems/p25/p25.mojo
index 8aaa74a1..812856fd 100644
--- a/problems/p25/p25.mojo
+++ b/problems/p25/p25.mojo
@@ -65,7 +65,7 @@ fn broadcast_shuffle_coordination[
 ](
     output: LayoutTensor[mut=True, dtype, layout],
     input: LayoutTensor[mut=False, dtype, layout],
-): 
+):
     """
     Combine broadcast() and shuffle_down() for advanced warp coordination.
     Lane 0 computes block-local scaling factor, broadcasts it to all lanes in the warp.
diff --git a/problems/p31/p31.mojo b/problems/p31/p31.mojo
index f9f29ac7..ed747e64 100644
--- a/problems/p31/p31.mojo
+++ b/problems/p31/p31.mojo
@@ -224,7 +224,9 @@ fn benchmark_minimal_parameterized[test_size: UInt](mut b: Bencher) raises:
 
 @parameter
 @always_inline
-fn benchmark_sophisticated_parameterized[test_size: UInt](mut b: Bencher) raises:
+fn benchmark_sophisticated_parameterized[
+    test_size: UInt
+](mut b: Bencher) raises:
     @parameter
     @always_inline
     fn sophisticated_workflow(ctx: DeviceContext) raises: