507 changes: 507 additions & 0 deletions CLAUDE.md

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/challenge.html
@@ -0,0 +1,43 @@
<p>
Implement a program that converts an RGB image to grayscale on the GPU.
Given an input RGB image represented as a 1D array of 32-bit floating-point values,
compute the corresponding grayscale image using the standard RGB-to-grayscale conversion formula.
</p>

<p>
The conversion formula is: <code>gray = 0.299 × R + 0.587 × G + 0.114 × B</code>
</p>

<p>
The input array <code>input</code> contains <code>height × width × 3</code> elements,
where the RGB values for each pixel are stored consecutively (R, G, B, R, G, B, ...).
The output array <code>output</code> should contain <code>height × width</code> grayscale values.
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>External libraries are not permitted</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in the array <code>output</code></li>
<li>Use the exact coefficients: 0.299 for red, 0.587 for green, 0.114 for blue</li>
</ul>

<h2>Example 1:</h2>
<pre>
Input: input = [255.0, 0.0, 0.0, 0.0, 255.0, 0.0, 0.0, 0.0, 255.0, 128.0, 128.0, 128.0], width=2, height=2
Output: output = [76.245, 149.685, 29.07, 128.0]
</pre>

<h2>Example 2:</h2>
<pre>
Input: input = [100.0, 150.0, 200.0], width=1, height=1
Output: output = [140.75]
</pre>

<h2>Constraints</h2>
<ul>
<li>1 ≤ <code>width</code> ≤ 4096</li>
<li>1 ≤ <code>height</code> ≤ 4096</li>
<li><code>width × height</code> ≤ 4,194,304</li>
<li>All RGB values are in the range [0.0, 255.0]</li>
</ul>
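As an aside for readers of this diff: the flat layout described above maps pixel index i to input offsets 3*i, 3*i + 1, and 3*i + 2, which is exactly the indexing a per-pixel GPU thread needs. Below is a minimal CPU-side Python sketch (illustrative only, not one of the files in this PR) that reproduces Example 2.

# Illustrative sketch only -- not part of the challenge files in this PR.
# Pixel i reads input[3*i], input[3*i + 1], input[3*i + 2].
def rgb_to_grayscale_flat(inp, width, height):
    out = [0.0] * (width * height)
    for i in range(width * height):
        r, g, b = inp[3 * i], inp[3 * i + 1], inp[3 * i + 2]
        out[i] = 0.299 * r + 0.587 * g + 0.114 * b
    return out

# Example 2 from the description: one pixel (100, 150, 200) -> 140.75
print([round(v, 3) for v in rgb_to_grayscale_flat([100.0, 150.0, 200.0], 1, 1)])  # [140.75]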
203 changes: 203 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/challenge.py
@@ -0,0 +1,203 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    def __init__(self):
        super().__init__(
            name="RGB to Grayscale", atol=1e-05, rtol=1e-05, num_gpus=1, access_tier="free"
        )

    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, width: int, height: int):
        assert input.shape == (height * width * 3,)
        assert output.shape == (height * width,)
        assert input.dtype == output.dtype == torch.float32
        assert input.device == output.device

        # Reshape input to (height, width, 3) for easier processing
        rgb_image = input.view(height, width, 3)

        # Apply RGB to grayscale conversion: gray = 0.299*R + 0.587*G + 0.114*B
        grayscale = (
            0.299 * rgb_image[:, :, 0] + 0.587 * rgb_image[:, :, 1] + 0.114 * rgb_image[:, :, 2]
        )

        # Flatten and store in output
        output.copy_(grayscale.flatten())

    def get_solve_signature(self) -> Dict[str, tuple]:
        return {
            "input": (ctypes.POINTER(ctypes.c_float), "in"),
            "output": (ctypes.POINTER(ctypes.c_float), "out"),
            "width": (ctypes.c_int, "in"),
            "height": (ctypes.c_int, "in"),
        }

    def generate_example_test(self) -> Dict[str, Any]:
        width, height = 2, 2
        # RGB values for a 2x2 image
        # Pixel (0,0): R=255, G=0, B=0 (red)
        # Pixel (0,1): R=0, G=255, B=0 (green)
        # Pixel (1,0): R=0, G=0, B=255 (blue)
        # Pixel (1,1): R=128, G=128, B=128 (gray)
        input_data = torch.tensor(
            [
                255.0, 0.0, 0.0,  # red
                0.0, 255.0, 0.0,  # green
                0.0, 0.0, 255.0,  # blue
                128.0, 128.0, 128.0,  # gray
            ],
            device="cuda",
            dtype=torch.float32,
        )
        output = torch.zeros(width * height, device="cuda", dtype=torch.float32)
        return {
            "input": input_data,
            "output": output,
            "width": width,
            "height": height,
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        test_cases = []

        # Small test cases
        test_cases.append(
            {
                "input": torch.tensor(
                    [255.0, 0.0, 0.0], device="cuda", dtype=torch.float32
                ),  # red pixel
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        test_cases.append(
            {
                "input": torch.tensor(
                    [0.0, 255.0, 0.0], device="cuda", dtype=torch.float32
                ),  # green pixel
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        test_cases.append(
            {
                "input": torch.tensor(
                    [0.0, 0.0, 255.0], device="cuda", dtype=torch.float32
                ),  # blue pixel
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        # 2x2 test case
        test_cases.append(
            {
                "input": torch.tensor(
                    [
                        100.0, 150.0, 200.0,  # mixed color 1
                        50.0, 75.0, 100.0,  # mixed color 2
                        200.0, 100.0, 50.0,  # mixed color 3
                        75.0, 125.0, 175.0,  # mixed color 4
                    ],
                    device="cuda",
                    dtype=torch.float32,
                ),
                "output": torch.zeros(4, device="cuda", dtype=torch.float32),
                "width": 2,
                "height": 2,
            }
        )

        # Edge cases: zeros and max values
        test_cases.append(
            {
                "input": torch.zeros(3, device="cuda", dtype=torch.float32),
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        test_cases.append(
            {
                "input": torch.full((3,), 255.0, device="cuda", dtype=torch.float32),
                "output": torch.zeros(1, device="cuda", dtype=torch.float32),
                "width": 1,
                "height": 1,
            }
        )

        # Larger test cases
        for size in [4, 8, 16, 32]:
            input_size = size * size * 3
            test_cases.append(
                {
                    "input": torch.randint(
                        0, 256, (input_size,), device="cuda", dtype=torch.float32
                    ),
                    "output": torch.zeros(size * size, device="cuda", dtype=torch.float32),
                    "width": size,
                    "height": size,
                }
            )

        # Non-square images
        test_cases.append(
            {
                "input": torch.randint(
                    0, 256, (2 * 3 * 3,), device="cuda", dtype=torch.float32
                ),  # 2x3 image (height=2, width=3)
                "output": torch.zeros(2 * 3, device="cuda", dtype=torch.float32),
                "width": 3,
                "height": 2,
            }
        )

        test_cases.append(
            {
                "input": torch.randint(
                    0, 256, (3 * 2 * 3,), device="cuda", dtype=torch.float32
                ),  # 3x2 image (height=3, width=2)
                "output": torch.zeros(3 * 2, device="cuda", dtype=torch.float32),
                "width": 2,
                "height": 3,
            }
        )

        return test_cases

    def generate_performance_test(self) -> Dict[str, Any]:
        width, height = 2048, 2048
        input_size = width * height * 3
        output_size = width * height
        return {
            "input": torch.randint(0, 256, (input_size,), device="cuda", dtype=torch.float32),
            "output": torch.zeros(output_size, device="cuda", dtype=torch.float32),
            "width": width,
            "height": height,
        }
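The atol/rtol values in the constructor above suggest that candidate outputs are compared against reference_impl with elementwise tolerances. The actual comparison lives in core.challenge_base, which is not part of this diff; the sketch below is only an assumption about how such a check could look, using torch.allclose and the PyTorch starter's solve signature.

# Hypothetical harness-style check -- core.challenge_base is not shown in this diff,
# so this only illustrates how the stated tolerances (atol=1e-05, rtol=1e-05) might be applied.
import torch

def check_case(challenge, solve_fn, case):
    expected = torch.zeros_like(case["output"])
    challenge.reference_impl(case["input"], expected, case["width"], case["height"])
    solve_fn(case["input"], case["output"], case["width"], case["height"])
    return torch.allclose(case["output"], expected, atol=1e-05, rtol=1e-05)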
13 changes: 13 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.cu
@@ -0,0 +1,13 @@
#include <cuda_runtime.h>

__global__ void rgb_to_grayscale_kernel(const float* input, float* output, int width, int height) {}

// input, output are device pointers
extern "C" void solve(const float* input, float* output, int width, int height) {
    int total_pixels = width * height;
    int threadsPerBlock = 256;
    int blocksPerGrid = (total_pixels + threadsPerBlock - 1) / threadsPerBlock;

    rgb_to_grayscale_kernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, width, height);
    cudaDeviceSynchronize();
}
8 changes: 8 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.cute.py
@@ -0,0 +1,8 @@
import cutlass
import cutlass.cute as cute


# input, output are tensors on the GPU
@cute.jit
def solve(input: cute.Tensor, output: cute.Tensor, width: cute.Uint32, height: cute.Uint32):
    pass
7 changes: 7 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.jax.py
@@ -0,0 +1,7 @@
import jax
import jax.numpy as jnp


# input, output are arrays on the GPU
def solve(input, output, width: int, height: int):
    pass
24 changes: 24 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.mojo
@@ -0,0 +1,24 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

fn rgb_to_grayscale_kernel(input: UnsafePointer[Float32], output: UnsafePointer[Float32], width: Int32, height: Int32):
    pass

# input, output are device pointers
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], width: Int32, height: Int32):
    var total_pixels = width * height
    var BLOCK_SIZE: Int32 = 256
    var ctx = DeviceContext()
    var num_blocks = ceildiv(total_pixels, BLOCK_SIZE)

    ctx.enqueue_function[rgb_to_grayscale_kernel](
        input, output, width, height,
        grid_dim = num_blocks,
        block_dim = BLOCK_SIZE
    )

    ctx.synchronize()

@@ -0,0 +1,6 @@
import torch


# input, output are tensors on the GPU
def solve(input: torch.Tensor, output: torch.Tensor, width: int, height: int):
    pass
16 changes: 16 additions & 0 deletions challenges/easy/65_rgb_to_grayscale/starter/starter.triton.py
@@ -0,0 +1,16 @@
import torch
import triton
import triton.language as tl


@triton.jit
def rgb_to_grayscale_kernel(input, output, width, height, BLOCK_SIZE: tl.constexpr):
    pass


# input, output are tensors on the GPU
def solve(input: torch.Tensor, output: torch.Tensor, width: int, height: int):
    total_pixels = width * height
    BLOCK_SIZE = 1024
    grid = (triton.cdiv(total_pixels, BLOCK_SIZE),)
    rgb_to_grayscale_kernel[grid](input, output, width, height, BLOCK_SIZE)
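For illustration, one way the empty Triton kernel above could be filled in: each program handles BLOCK_SIZE pixels, masks the tail, and uses stride-3 loads to pick up R, G, and B. This is a sketch for readers of the diff, not a provided solution.

# Sketch only -- the starter above deliberately leaves the kernel body empty.
@triton.jit
def rgb_to_grayscale_kernel_sketch(input, output, width, height, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < width * height
    r = tl.load(input + 3 * offsets, mask=mask)
    g = tl.load(input + 3 * offsets + 1, mask=mask)
    b = tl.load(input + 3 * offsets + 2, mask=mask)
    tl.store(output + offsets, 0.299 * r + 0.587 * g + 0.114 * b, mask=mask)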