Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions challenges/medium/55_max_2d_subarray_sum/challenge.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<p>
Implement a program that computes the maximum sum over all contiguous 2D subarrays (windows) of size exactly <code>window_size x window_size</code>. You are given a 2D array <code>input</code> of size <code>N x N</code> consisting of 32-bit signed integers, and an integer <code>window_size</code>.
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>Use only native features (external libraries are not permitted)</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in the <code>output</code> variable</li>
</ul>

<h2>Example 1:</h2>
<pre>
Input: input = [[1, 2, 3],
[4, 5, 1],
[5, 1, 7]]
window_size = 2
Output: output = 15
</pre>

<h2>Example 2:</h2>
<pre>
Input: input = [[-1, -2, -3],
[-4, -5, -1],
[-5, -1, -7]]
window_size = 2
Output: output = -11
</pre>

<h2>Constraints</h2>
<ul>
<li>1 &le; <code>N</code> &le; 5,000</li>
<li>-10 &le; <code>input[i][j]</code> &le; 10</li>
<li>1 &le; <code>window_size</code> &le; <code>N</code></li>
</ul>
139 changes: 139 additions & 0 deletions challenges/medium/55_max_2d_subarray_sum/challenge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import ctypes
from typing import Any, List, Dict
import torch
from core.challenge_base import ChallengeBase

class Challenge(ChallengeBase):
def __init__(self):
    """Register this challenge's metadata with the base class."""
    # Collected in one mapping and forwarded as keyword arguments, in the
    # same order the base-class constructor was originally called with.
    metadata = {
        "name": "Max 2D Subarray Sum",
        "atol": 1e-05,
        "rtol": 1e-05,
        "num_gpus": 1,
        "access_tier": "free",
    }
    super().__init__(**metadata)

def reference_impl(self, input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    """Store the maximum ``window_size x window_size`` window sum in ``output[0]``.

    A 2D inclusive prefix sum of ``input`` is padded with a leading zero row
    and column so that the sum of every square window can be read off with
    four corner lookups; the largest of those sums is the result.

    Args:
        input: ``(N, N)`` int32 tensor.
        output: ``(1,)`` int32 tensor that receives the result in place.
        N: Side length of ``input``.
        window_size: Side length of the square window. Must be at least 1:
            the negative-stop slices below would be empty for 0.
    """
    # Validate input types and shapes
    assert input.shape == (N, N)
    assert output.shape == (1,)
    assert input.dtype == torch.int32
    assert output.dtype == torch.int32

    prefix = input.cumsum(dim=0).cumsum(dim=1)

    # padded[i, j] is the sum of input[:i, :j]. It is allocated on the
    # input's device so the whole computation stays there instead of
    # falling back to the default device.
    padded = torch.zeros((N + 1, N + 1), dtype=torch.int32, device=input.device)
    padded[1:, 1:] = prefix

    # Each view has shape (N - window_size + 1, N - window_size + 1); entry
    # (r, c) of the combination is the sum of the window whose top-left
    # corner is input[r, c].
    top_left = padded[:-window_size, :-window_size]
    top_right = padded[:-window_size, window_size:]
    bottom_left = padded[window_size:, :-window_size]
    bottom_right = padded[window_size:, window_size:]
    window_sums = bottom_right - top_right - bottom_left + top_left

    output[0] = torch.max(window_sums)

def get_solve_signature(self) -> Dict[str, Any]:
    """Return the ctypes type of each ``solve`` argument, keyed by name."""
    # Both buffers are passed as pointers to C ints; the sizes as plain C ints.
    int_pointer = ctypes.POINTER(ctypes.c_int)
    signature: Dict[str, Any] = {}
    signature["input"] = int_pointer
    signature["output"] = int_pointer
    signature["N"] = ctypes.c_int
    signature["window_size"] = ctypes.c_int
    return signature

def generate_example_test(self) -> Dict[str, Any]:
    """Return the ``solve`` arguments for the 3x3 example with a 2x2 window."""
    grid = [[1, 2, 3], [4, 5, 1], [5, 1, 7]]
    return {
        "input": torch.tensor(grid, device="cuda", dtype=torch.int32),
        # Uninitialised single-element buffer for the result.
        "output": torch.empty(1, device="cuda", dtype=torch.int32),
        "N": 3,
        "window_size": 2,
    }

def generate_functional_test(self) -> List[Dict[str, Any]]:
    """Build the functional test cases as a list of ``solve`` argument dicts.

    Every case holds a square int32 ``input`` tensor on the CUDA device, an
    uninitialised one-element int32 ``output`` tensor, the side length ``N``
    and the ``window_size`` to evaluate.
    """
    dtype = torch.int32

    def make_case(grid: torch.Tensor, window_size: int) -> Dict[str, Any]:
        # N is read from the grid so it cannot drift from the tensor's shape.
        return {
            "input": grid,
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": grid.shape[0],
            "window_size": window_size,
        }

    def random_grid(low: int, high: int, side: int) -> torch.Tensor:
        # torch.randint samples from [low, high), i.e. high is exclusive.
        return torch.randint(low, high, (side, side), device="cuda", dtype=dtype)

    tests = []

    # basic_example (all-negative 3x3 grid)
    tests.append(make_case(
        torch.tensor([[-1, -2, -3], [-4, -5, -1], [-5, -1, -7]], device="cuda", dtype=dtype),
        2,
    ))

    # all_same_value: window covering the whole grid, one smaller, and 1x1
    for window_size in (16, 15, 1):
        tests.append(make_case(torch.full((16, 16), 2, device="cuda", dtype=dtype), window_size))

    # all_minus_value: constant and random strictly negative grids
    tests.append(make_case(torch.full((10, 10), -10, device="cuda", dtype=dtype), 5))
    tests.append(make_case(random_grid(-10, 0, 123), 7))

    # random_mixed_values: positive and negative entries
    tests.append(make_case(random_grid(-10, 11, 123), 7))

    # medium_size
    tests.append(make_case(random_grid(-10, 11, 1000), 476))

    # large_size
    tests.append(make_case(random_grid(-10, 11, 3000), 2011))

    return tests

def generate_performance_test(self) -> Dict[str, Any]:
    """Return the ``solve`` arguments for the performance run.

    The input is a 5000 x 5000 int32 grid of random values drawn from
    [-10, 10] on the CUDA device, evaluated with a 2500 x 2500 window.
    """
    side = 5000
    return {
        # The upper bound of torch.randint is exclusive, hence 11.
        "input": torch.randint(-10, 11, (side, side), device="cuda", dtype=torch.int32),
        "output": torch.empty(1, device="cuda", dtype=torch.int32),
        "N": side,
        "window_size": 2500,
    }
6 changes: 6 additions & 0 deletions challenges/medium/55_max_2d_subarray_sum/starter/starter.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include <cuda_runtime.h>

// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const int* input, int* output, int N, int window_size) {
    // Starter stub: intentionally empty. The solution goes here and must
    // leave the final result in `output`.
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(input: UnsafePointer[Int32], output: UnsafePointer[Int32], N: Int32, window_size: Int32):
    # Starter stub: intentionally empty. The solution goes here and must
    # leave the final result in `output`.
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import torch

# input, output are tensors on the GPU
def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    """Starter stub for the PyTorch solution; intentionally does nothing.

    Args:
        input: Tensor on the GPU holding the grid to scan.
        output: Tensor on the GPU that must receive the final result.
        N: Side length of the square grid.
        window_size: Side length of the square window to sum over.
    """
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import torch
import triton
Copy link

Copilot AI Jan 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import of 'triton' is not used.

Suggested change
import triton

Copilot uses AI. Check for mistakes.
import triton.language as tl
Copy link

Copilot AI Jan 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import of 'tl' is not used.

Suggested change
import triton.language as tl

Copilot uses AI. Check for mistakes.

# input, output are tensors on the GPU
def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    """Starter stub for the Triton solution; intentionally does nothing.

    Args:
        input: Tensor on the GPU holding the grid to scan.
        output: Tensor on the GPU that must receive the final result.
        N: Side length of the square grid.
        window_size: Side length of the square window to sum over.
    """
    pass