507 changes: 507 additions & 0 deletions CLAUDE.md

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/challenge.html
@@ -0,0 +1,43 @@
<p>
Implement a program that converts an RGB image to grayscale on the GPU.
Given an input RGB image represented as a 1D array of 32-bit floating-point values,
compute the corresponding grayscale image using the standard RGB-to-grayscale conversion formula.
</p>

<p>
The conversion formula is: <code>gray = 0.299 × R + 0.587 × G + 0.114 × B</code>
</p>

<p>
The input array <code>input</code> contains <code>height × width × 3</code> elements,
where the RGB values for each pixel are stored consecutively (R, G, B, R, G, B, ...).
The output array <code>output</code> should contain <code>height × width</code> grayscale values.
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>External libraries are not permitted</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in the array <code>output</code></li>
<li>Use the exact coefficients: 0.299 for red, 0.587 for green, 0.114 for blue</li>
</ul>

<h2>Example 1:</h2>
<pre>
Input: input = [255.0, 0.0, 0.0, 0.0, 255.0, 0.0, 0.0, 0.0, 255.0, 128.0, 128.0, 128.0], width=2, height=2
Output: output = [76.245, 149.685, 29.07, 128.0]
</pre>

<h2>Example 2:</h2>
<pre>
Input: input = [100.0, 150.0, 200.0], width=1, height=1
Output: output = [140.75]
</pre>

<h2>Constraints</h2>
<ul>
<li>1 ≤ <code>width</code> ≤ 4096</li>
<li>1 ≤ <code>height</code> ≤ 4096</li>
<li><code>width × height</code> ≤ 4,194,304</li>
<li>All RGB values are in the range [0.0, 255.0]</li>
</ul>
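As an aside for readers of this diff: the flat layout described above maps pixel index i to input offsets 3*i, 3*i + 1, and 3*i + 2, which is exactly the indexing a per-pixel GPU thread needs. Below is a minimal CPU-side Python sketch (illustrative only, not one of the files in this PR) that reproduces Example 2.

# Illustrative sketch only -- not part of the challenge files in this PR.
# Pixel i reads input[3*i], input[3*i + 1], input[3*i + 2].
def rgb_to_grayscale_flat(inp, width, height):
    out = [0.0] * (width * height)
    for i in range(width * height):
        r, g, b = inp[3 * i], inp[3 * i + 1], inp[3 * i + 2]
        out[i] = 0.299 * r + 0.587 * g + 0.114 * b
    return out

# Example 2 from the description: one pixel (100, 150, 200) -> 140.75
print([round(v, 3) for v in rgb_to_grayscale_flat([100.0, 150.0, 200.0], 1, 1)])  # [140.75]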
203 changes: 203 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/challenge.py
@@ -0,0 +1,203 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    def __init__(self):
        super().__init__(
            name="RGB to Grayscale", atol=1e-05, rtol=1e-05, num_gpus=1, access_tier="free"
        )

    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, width: int, height: int):
        assert input.shape == (height * width * 3,)
        assert output.shape == (height * width,)
        assert input.dtype == output.dtype == torch.float32
        assert input.device == output.device

        # Reshape input to (height, width, 3) for easier processing
        rgb_image = input.view(height, width, 3)

        # Apply RGB to grayscale conversion: gray = 0.299*R + 0.587*G + 0.114*B
        grayscale = (
            0.299 * rgb_image[:, :, 0] + 0.587 * rgb_image[:, :, 1] + 0.114 * rgb_image[:, :, 2]
        )

        # Flatten and store in output
        output.copy_(grayscale.flatten())

    def get_solve_signature(self) -> Dict[str, tuple]:
        return {
            "input": (ctypes.POINTER(ctypes.c_float), "in"),
            "output": (ctypes.POINTER(ctypes.c_float), "out"),
            "width": (ctypes.c_int, "in"),
            "height": (ctypes.c_int, "in"),
        }

    def generate_example_test(self) -> Dict[str, Any]:
        width, height = 2, 2
        # RGB values for a 2x2 image
        # Pixel (0,0): R=255, G=0, B=0 (red)
        # Pixel (0,1): R=0, G=255, B=0 (green)
        # Pixel (1,0): R=0, G=0, B=255 (blue)
        # Pixel (1,1): R=128, G=128, B=128 (gray)
        input_data = torch.tensor(
            [
                255.0, 0.0, 0.0,  # red
                0.0, 255.0, 0.0,  # green
                0.0, 0.0, 255.0,  # blue
                128.0, 128.0, 128.0,  # gray
            ],
            device="cuda",
            dtype=torch.float32,
        )
        output = torch.zeros(width * height, device="cuda", dtype=torch.float32)
        return {
            "input": input_data,
            "output": output,
            "width": width,
            "height": height,
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        test_cases = []

        # Small test cases
        test_cases.append(
            {
                "input": torch.tensor(
                    [255.0, 0.0, 0.0], device="cuda", dtype=torch.float32
                ),  # red pixel
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        test_cases.append(
            {
                "input": torch.tensor(
                    [0.0, 255.0, 0.0], device="cuda", dtype=torch.float32
                ),  # green pixel
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        test_cases.append(
            {
                "input": torch.tensor(
                    [0.0, 0.0, 255.0], device="cuda", dtype=torch.float32
                ),  # blue pixel
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        # 2x2 test case
        test_cases.append(
            {
                "input": torch.tensor(
                    [
                        100.0, 150.0, 200.0,  # mixed color 1
                        50.0, 75.0, 100.0,  # mixed color 2
                        200.0, 100.0, 50.0,  # mixed color 3
                        75.0, 125.0, 175.0,  # mixed color 4
                    ],
                    device="cuda",
                    dtype=torch.float32,
                ),
                "output": torch.zeros(4, device="cuda", dtype=torch.float32),
                "width": 2,
                "height": 2,
            }
        )

        # Edge cases: zeros and max values
        test_cases.append(
            {
                "input": torch.zeros(3, device="cuda", dtype=torch.float32),
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        test_cases.append(
            {
                "input": torch.full((3,), 255.0, device="cuda", dtype=torch.float32),
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        # Larger test cases
        for size in [4, 8, 16, 32]:
            input_size = size * size * 3
            test_cases.append(
                {
                    "input": torch.randint(
                        0, 256, (input_size,), device="cuda", dtype=torch.float32
                    ),
                    "output": torch.zeros(size * size, device="cuda", dtype=torch.float32),
                    "width": size,
                    "height": size,
                }
            )

        # Non-square images
        test_cases.append(
            {
                "input": torch.randint(
                    0, 256, (2 * 3 * 3,), device="cuda", dtype=torch.float32
                ),  # 2x3 image (height=2, width=3)
                "output": torch.zeros(2 * 3, device="cuda", dtype=torch.float32),
                "width": 3,
                "height": 2,
            }
        )

        test_cases.append(
            {
                "input": torch.randint(
                    0, 256, (3 * 2 * 3,), device="cuda", dtype=torch.float32
                ),  # 3x2 image (height=3, width=2)
                "output": torch.zeros(3 * 2, device="cuda", dtype=torch.float32),
                "width": 2,
                "height": 3,
            }
        )

        return test_cases

    def generate_performance_test(self) -> Dict[str, Any]:
        width, height = 2048, 2048
        input_size = width * height * 3
        output_size = width * height
        return {
            "input": torch.randint(0, 256, (input_size,), device="cuda", dtype=torch.float32),
            "output": torch.zeros(output_size, device="cuda", dtype=torch.float32),
            "width": width,
            "height": height,
        }
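The atol/rtol values in the constructor above suggest that candidate outputs are compared against reference_impl with elementwise tolerances. The actual comparison lives in core.challenge_base, which is not part of this diff; the sketch below is only an assumption about how such a check could look, using torch.allclose and the PyTorch starter's solve signature.

# Hypothetical harness-style check -- core.challenge_base is not shown in this diff,
# so this only illustrates how the stated tolerances (atol=1e-05, rtol=1e-05) might be applied.
import torch

def check_case(challenge, solve_fn, case):
    expected = torch.zeros_like(case["output"])
    challenge.reference_impl(case["input"], expected, case["width"], case["height"])
    solve_fn(case["input"], case["output"], case["width"], case["height"])
    return torch.allclose(case["output"], expected, atol=1e-05, rtol=1e-05)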
13 changes: 13 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.cu
@@ -0,0 +1,13 @@
#include <cuda_runtime.h>

__global__ void rgb_to_grayscale_kernel(const float* input, float* output, int width, int height) {}

// input, output are device pointers
extern "C" void solve(const float* input, float* output, int width, int height) {
    int total_pixels = width * height;
    int threadsPerBlock = 256;
    int blocksPerGrid = (total_pixels + threadsPerBlock - 1) / threadsPerBlock;

    rgb_to_grayscale_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, width, height);
    cudaDeviceSynchronize();
}
8 changes: 8 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.cute.py
@@ -0,0 +1,8 @@
import cutlass
import cutlass.cute as cute


# input, output are tensors on the GPU
@cute.jit
def solve(input: cute.Tensor, output: cute.Tensor, width: cute.Uint32, height: cute.Uint32):
    pass
7 changes: 7 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.jax.py
@@ -0,0 +1,7 @@
import jax
import jax.numpy as jnp


# input, output are arrays on the GPU
def solve(input, output, width: int, height: int):
    pass
24 changes: 24 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.mojo
@@ -0,0 +1,24 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

fn rgb_to_grayscale_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], width: Int32, height: Int32):
    pass

# input, output are device pointers
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], width: Int32, height: Int32):
    var total_pixels = width * height
    var BLOCK_SIZE: Int32 = 256
    var ctx = DeviceContext()
    var num_blocks = ceildiv(total_pixels, BLOCK_SIZE)

    ctx.enqueue_function[rgb_to_grayscale_kernel](
        input, output, width, height,
        grid_dim = num_blocks,
        block_dim = BLOCK_SIZE
    )

    ctx.synchronize()

@@ -0,0 +1,6 @@
import torch


# input, output are tensors on the GPU
def solve(input: torch.Tensor, output: torch.Tensor, width: int, height: int):
    pass
16 changes: 16 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.triton.py
@@ -0,0 +1,16 @@
import torch
import triton
import triton.language as tl


@triton.jit
def rgb_to_grayscale_kernel(input, output, width, height, BLOCK_SIZE: tl.constexpr):
    pass


# input, output are tensors on the GPU
def solve(input: torch.Tensor, output: torch.Tensor, width: int, height: int):
    total_pixels = width * height
    BLOCK_SIZE = 1024
    grid = (triton.cdiv(total_pixels, BLOCK_SIZE),)
    rgb_to_grayscale_kernel[grid](input, output, width, height, BLOCK_SIZE)
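For illustration, one way the empty Triton kernel above could be filled in: each program handles BLOCK_SIZE pixels, masks the tail, and uses stride-3 loads to pick up R, G, and B. This is a sketch for readers of the diff, not a provided solution.

# Sketch only -- the starter above deliberately leaves the kernel body empty.
@triton.jit
def rgb_to_grayscale_kernel_sketch(input, output, width, height, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < width * height
    r = tl.load(input + 3 * offsets, mask=mask)
    g = tl.load(input + 3 * offsets + 1, mask=mask)
    b = tl.load(input + 3 * offsets + 2, mask=mask)
    tl.store(output + offsets, 0.299 * r + 0.587 * g + 0.114 * b, mask=mask)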