-
Notifications
You must be signed in to change notification settings - Fork 50
add max 2d subarray sum #81
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,35 @@ | ||
| <p> | ||
| Implement a program that computes the maximum sum of any contiguous 2D subarray of size exactly <code>window_size x window_size</code>. You are given a 2D array <code>input</code> of size <code>N x N</code> consisting of 32-bit signed integers, and an integer <code>window_size</code>. | ||
| </p> | ||
|
|
||
| <h2>Implementation Requirements</h2> | ||
| <ul> | ||
| <li>Use only native features (external libraries are not permitted)</li> | ||
| <li>The <code>solve</code> function signature must remain unchanged</li> | ||
| <li>The final result must be stored in the <code>output</code> variable</li> | ||
| </ul> | ||
|
|
||
| <h2>Example 1:</h2> | ||
| <pre> | ||
| Input: input = [[1, 2, 3], | ||
| [4, 5, 1], | ||
| [5, 1, 7]] | ||
| window_size = 2 | ||
| Output: output = 15 | ||
| </pre> | ||
|
|
||
| <h2>Example 2:</h2> | ||
| <pre> | ||
| Input: input = [[-1, -2, -3], | ||
| [-4, -5, -1], | ||
| [-5, -1, -7]] | ||
| window_size = 2 | ||
| Output: output = -11 | ||
| </pre> | ||
|
|
||
| <h2>Constraints</h2> | ||
| <ul> | ||
| <li>1 ≤ <code>N</code> ≤ 5,000</li> | ||
| <li>-10 ≤ <code>input[i]</code> ≤ 10</li> | ||
| <li>1 ≤ <code>window_size</code> ≤ <code>N</code></li> | ||
| </ul> |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,139 @@ | ||||||
| import ctypes | ||||||
| from typing import Any, List, Dict | ||||||
| import torch | ||||||
| from core.challenge_base import ChallengeBase | ||||||
|
|
||||||
| class Challenge(ChallengeBase): | ||||||
def __init__(self):
    """Register this challenge's metadata with the base class.

    All values are forwarded unchanged to the parent initializer as keyword
    arguments. NOTE(review): ``atol``/``rtol`` are presumably the tolerances
    used when comparing a submission against the reference — confirm in
    ``ChallengeBase``.
    """
    settings = {
        "name": "Max 2D Subarray Sum",
        "atol": 1e-05,
        "rtol": 1e-05,
        "num_gpus": 1,
        "access_tier": "free",
    }
    super().__init__(**settings)
|
|
||||||
def reference_impl(self, input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    """Store the largest ``window_size x window_size`` window sum in ``output[0]``.

    The sum of every square window is obtained from a zero-padded 2D prefix
    sum by inclusion-exclusion of its four corner values, and the maximum of
    those sums is written to ``output``.

    Args:
        input: int32 tensor of shape ``(N, N)``.
        output: int32 tensor of shape ``(1,)``; receives the result.
        N: side length of ``input``.
        window_size: side length of the square window, ``1 <= window_size <= N``.

    Raises:
        AssertionError: if a shape, dtype or ``window_size`` check fails.
    """
    # Validate input types and shapes
    assert input.shape == (N, N)
    assert output.shape == (1,)
    assert input.dtype == torch.int32
    assert output.dtype == torch.int32
    # Outside this range the slices below are empty or mismatched.
    assert 1 <= window_size <= N

    # padded[i, j] holds the sum of input[:i, :j]; row 0 and column 0 stay zero.
    psum = input.cumsum(dim=0).cumsum(dim=1)
    # Allocate on the same device as `input`: the default (CPU) allocation
    # cannot be mixed with a CUDA prefix sum in the assignment below.
    padded = torch.zeros((N + 1, N + 1), dtype=torch.int32, device=input.device)
    padded[1:, 1:] = psum

    # Corner views of the prefix-sum table, each offset by the window size.
    top_left = padded[:-window_size, :-window_size]
    top_right = padded[:-window_size, window_size:]
    bottom_left = padded[window_size:, :-window_size]
    bottom_right = padded[window_size:, window_size:]
    window_sums = bottom_right - top_right - bottom_left + top_left

    output[0] = torch.max(window_sums)
|
|
||||||
def get_solve_signature(self) -> Dict[str, Any]:
    """Map each ``solve`` parameter name to its ctypes type.

    Returns:
        A dict, in parameter order, with ``input`` and ``output`` as
        pointers to C ``int`` and ``N`` and ``window_size`` as C ``int``.
    """
    int_ptr = ctypes.POINTER(ctypes.c_int)
    signature: Dict[str, Any] = {}
    signature["input"] = int_ptr
    signature["output"] = int_ptr
    signature["N"] = ctypes.c_int
    signature["window_size"] = ctypes.c_int
    return signature
|
|
||||||
def generate_example_test(self) -> Dict[str, Any]:
    """Build the arguments for the worked example: a 3x3 grid with a 2x2 window.

    Returns:
        A dict with the int32 ``input`` grid and an uninitialized
        one-element int32 ``output`` tensor, both created on the ``"cuda"``
        device, together with ``N`` and ``window_size``.
    """
    grid_rows = [
        [1, 2, 3],
        [4, 5, 1],
        [5, 1, 7],
    ]
    grid = torch.tensor(grid_rows, device="cuda", dtype=torch.int32)
    result_slot = torch.empty(1, device="cuda", dtype=torch.int32)
    return dict(input=grid, output=result_slot, N=3, window_size=2)
|
|
||||||
| def generate_functional_test(self) -> List[Dict[str, Any]]: | ||||||
| dtype = torch.int32 | ||||||
| tests = [] | ||||||
|
|
||||||
| # basic_example | ||||||
| tests.append({ | ||||||
| "input": torch.tensor([[-1, -2, -3], [-4, -5, -1], [-5, -1, -7]], device="cuda", dtype=dtype), | ||||||
| "output": torch.empty(1, device="cuda", dtype=dtype), | ||||||
| "N": 3, | ||||||
| "window_size": 2 | ||||||
| }) | ||||||
|
|
||||||
| # all_same_value | ||||||
| tests.append({ | ||||||
| "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype), | ||||||
| "output": torch.empty(1, device="cuda", dtype=dtype), | ||||||
| "N": 16, | ||||||
| "window_size": 16 | ||||||
| }) | ||||||
|
|
||||||
| tests.append({ | ||||||
| "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype), | ||||||
| "output": torch.empty(1, device="cuda", dtype=dtype), | ||||||
| "N": 16, | ||||||
| "window_size": 15 | ||||||
| }) | ||||||
|
|
||||||
| tests.append({ | ||||||
| "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype), | ||||||
| "output": torch.empty(1, device="cuda", dtype=dtype), | ||||||
| "N": 16, | ||||||
| "window_size": 1 | ||||||
| }) | ||||||
|
|
||||||
| # all_minus_value | ||||||
| tests.append({ | ||||||
| "input": torch.tensor([[-10]*10]*10, device="cuda", dtype=dtype), | ||||||
| "output": torch.empty(1, device="cuda", dtype=dtype), | ||||||
| "N": 10, | ||||||
| "window_size": 5 | ||||||
| }) | ||||||
|
|
||||||
| tests.append({ | ||||||
| "input": torch.randint(-10, 0, (123, 123), device="cuda", dtype=dtype), | ||||||
| "output": torch.empty(1, device="cuda", dtype=dtype), | ||||||
| "N": 123, | ||||||
| "window_size": 7 | ||||||
| }) | ||||||
|
|
||||||
| # increasing_sequence | ||||||
|
||||||
| # increasing_sequence | |
| # mixed_positive_negative |
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -0,0 +1,6 @@ | ||||
| #include <cuda_runtime.h> | ||||
|
|
||||
| // input, output are device pointers (i.e. pointers to memory on the GPU) | ||||
| extern "C" void solve(const int* input, int* output, int N, int window_size) { | ||||
|
|
||||
|
||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,9 @@ | ||||||
| from gpu.host import DeviceContext | ||||||
| from gpu.id import block_dim, block_idx, thread_idx | ||||||
| from memory import UnsafePointer | ||||||
| from math import ceildiv | ||||||
|
|
||||||
| # input, output are device pointers (i.e. pointers to memory on the GPU) | ||||||
| @export | ||||||
|
||||||
| @export | |
| @export |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| import torch | ||
|
|
||
| # input, output are tensors on the GPU | ||
def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    """Starter stub for the solution; it currently does nothing and returns None."""
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -0,0 +1,7 @@ | ||||
| import torch | ||||
| import triton | ||||
|
||||
| import triton |
Copilot
AI
Jan 3, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Import of 'tl' is not used.
| import triton.language as tl |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The padded tensor is created on CPU by default with torch.zeros, but the input tensor is on GPU (cuda device). This will cause a runtime error when trying to assign psum to padded[1:, 1:] since tensors need to be on the same device. The padded tensor should be created on the same device as the input tensor.