AlphaGPU · SungHwanYun · Sep 10, 2025 · Copilot · Jan 3, 2026 · Copilot
@@ -0,0 +1,35 @@
+<p>
+    Implement a program that computes the maximum sum over all contiguous 2D subarrays (windows) of size exactly <code>window_size x window_size</code>. You are given a 2D array <code>input</code> of size <code>N x N</code> consisting of 32-bit signed integers, and an integer <code>window_size</code>.
+</p>
+
+<h2>Implementation Requirements</h2>
+<ul>
+    <li>Use only native features (external libraries are not permitted)</li>
+    <li>The <code>solve</code> function signature must remain unchanged</li>
+    <li>The final result must be stored in the <code>output</code> variable</li>
+</ul>
+
+<h2>Example 1:</h2>
+<pre>
+Input:  input = [[1, 2, 3],
+                 [4, 5, 1],
+                 [5, 1, 7]]
+        window_size = 2
+Output: output = 15
+</pre>
+
+<h2>Example 2:</h2>
+<pre>
+Input:  input = [[-1, -2, -3],
+                 [-4, -5, -1],
+                 [-5, -1, -7]]
+        window_size = 2
+Output: output = -11
+</pre>
+
+<h2>Constraints</h2>
+<ul>
+    <li>1 &le; <code>N</code> &le; 5,000</li>
+    <li>-10 &le; <code>input[i][j]</code> &le; 10</li>
+    <li>1 &le; <code>window_size</code> &le; <code>N</code></li>
+</ul>
@@ -0,0 +1,139 @@
+import ctypes
+from typing import Any, List, Dict
+import torch
+from core.challenge_base import ChallengeBase
+
class Challenge(ChallengeBase):
    """Challenge definition for the maximum ``window_size x window_size`` window sum.

    Bundles the reference implementation, the ctypes signature of ``solve``,
    and the example, functional and performance test inputs.
    """

    def __init__(self):
        super().__init__(
            name="Max 2D Subarray Sum",
            atol=1e-05,
            rtol=1e-05,
            num_gpus=1,
            access_tier="free",
        )

    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
        """Write the largest square-window sum of ``input`` into ``output[0]``.

        Args:
            input: ``(N, N)`` int32 tensor.
            output: ``(1,)`` int32 tensor; element 0 receives the result.
            N: side length of ``input``.
            window_size: side length of the square window, ``1 <= window_size <= N``.

        Raises:
            AssertionError: if a shape, dtype or the window size is invalid.
        """
        # Validate input types and shapes
        assert input.shape == (N, N)
        assert output.shape == (1,)
        assert input.dtype == torch.int32
        assert output.dtype == torch.int32
        # A window size outside [1, N] makes the slices below empty or
        # misaligned, so reject it up front instead of failing obscurely.
        assert 1 <= window_size <= N

        # 2D prefix sums with an extra leading row and column of zeros, so that
        # windows touching the top or left edge need no special case.
        psum = input.cumsum(dim=0).cumsum(dim=1)
        # Allocate on the same device as the input to avoid a host round trip.
        padded = torch.zeros((N + 1, N + 1), dtype=torch.int32, device=input.device)
        padded[1:, 1:] = psum

        # Each slice holds one corner of every window; inclusion-exclusion on
        # the four corners gives all (N - window_size + 1)^2 window sums at once.
        top_left = padded[:-window_size, :-window_size]
        top_right = padded[:-window_size, window_size:]
        bottom_left = padded[window_size:, :-window_size]
        bottom_right = padded[window_size:, window_size:]
        window_sums = bottom_right - top_right - bottom_left + top_left

        output[0] = torch.max(window_sums)

    def get_solve_signature(self) -> Dict[str, Any]:
        """Return the ctypes type of each ``solve`` argument, keyed by name."""
        return {
            "input": ctypes.POINTER(ctypes.c_int),
            "output": ctypes.POINTER(ctypes.c_int),
            "N": ctypes.c_int,
            "window_size": ctypes.c_int,
        }

    def generate_example_test(self) -> Dict[str, Any]:
        """Return the 3x3, ``window_size=2`` example case as ``solve`` arguments."""
        dtype = torch.int32
        input = torch.tensor([[1, 2, 3], [4, 5, 1], [5, 1, 7]], device="cuda", dtype=dtype)
        output = torch.empty(1, device="cuda", dtype=dtype)
        return {
            "input": input,
            "output": output,
            "N": 3,
            "window_size": 2,
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Return the functional test cases, each a dict of ``solve`` arguments."""
        dtype = torch.int32
        tests = []

        # basic_example (all-negative 3x3)
        tests.append({
            "input": torch.tensor([[-1, -2, -3], [-4, -5, -1], [-5, -1, -7]], device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 3,
            "window_size": 2,
        })

        # all_same_value: window covers the whole matrix
        tests.append({
            "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 16,
            "window_size": 16,
        })

        # all_same_value: window one smaller than the matrix
        tests.append({
            "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 16,
            "window_size": 15,
        })

        # all_same_value: single-element window
        tests.append({
            "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 16,
            "window_size": 1,
        })

        # all_minus_value: constant -10
        tests.append({
            "input": torch.tensor([[-10]*10]*10, device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 10,
            "window_size": 5,
        })

        # all_minus_value: random values in [-10, -1]
        tests.append({
            "input": torch.randint(-10, 0, (123, 123), device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 123,
            "window_size": 7,
        })

        # mixed_positive_negative
        tests.append({
            "input": torch.randint(-10, 11, (123, 123), device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 123,
            "window_size": 7,
        })

        # medium_size
        tests.append({
            "input": torch.randint(-10, 11, (1000, 1000), device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 1000,
            "window_size": 476,
        })

        # large_size
        tests.append({
            "input": torch.randint(-10, 11, (3000, 3000), device="cuda", dtype=dtype),
            "output": torch.empty(1, device="cuda", dtype=dtype),
            "N": 3000,
            "window_size": 2011,
        })

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        """Return the 5000x5000, ``window_size=2500`` benchmark case."""
        dtype = torch.int32
        input = torch.randint(-10, 11, (5000, 5000), device="cuda", dtype=dtype)
        output = torch.empty(1, device="cuda", dtype=dtype)
        return {
            "input": input,
            "output": output,
            "N": 5000,
            "window_size": 2500,
        }
@@ -0,0 +1,6 @@
+#include <cuda_runtime.h>
+
// Starter stub: the finished solution computes the maximum sum over every
// window_size x window_size window of the N x N matrix `input` and stores
// it in output[0]. The stub itself does nothing.
// input, output are device pointers (i.e. pointers to memory on the GPU)
extern "C" void solve(const int* input, int* output, int N, int window_size) {

}
@@ -0,0 +1,9 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
# Starter stub: the finished solution computes the maximum sum over every
# window_size x window_size window of the N x N matrix `input` and stores
# it in output[0]. The stub itself does nothing.
# input, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(input: UnsafePointer[Int32], output: UnsafePointer[Int32], N: Int32, window_size: Int32):
    pass
@@ -0,0 +1,5 @@
+import torch
+
# input, output are tensors on the GPU
def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    """Starter stub for the PyTorch solution; does nothing as written.

    The finished solution should sum every ``window_size x window_size``
    window of the ``N x N`` ``input`` and store the largest sum in ``output``.
    """
    pass
@@ -0,0 +1,7 @@
+import torch
+import triton
-import triton
-import triton
+import triton.language as tl
-import triton.language as tl
-import triton.language as tl
+
# input, output are tensors on the GPU
def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    """Starter stub for the Triton solution; does nothing as written.

    The finished solution should sum every ``window_size x window_size``
    window of the ``N x N`` ``input`` and store the largest sum in ``output``.
    """
    pass