Skip to content

Commit 2f3f7b0

Browse files
authored
fix(api): add GPU resource allocation support and fix pod request/limit calculations (#10368)
* feat(pod): add GPU resource allocation support and refactor request/limit functions * test(pod): add GPU allocation resource validation in unit tests
1 parent 1d1d9f4 commit 2f3f7b0

File tree

8 files changed

+204
-50
lines changed

8 files changed

+204
-50
lines changed

modules/api/pkg/resource/node/detail.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ func getNodeAllocatedResources(node v1.Node, podList *v1.PodList) (NodeAllocated
170170
reqs, limits := map[v1.ResourceName]resource.Quantity{}, map[v1.ResourceName]resource.Quantity{}
171171

172172
for _, p := range podList.Items {
173-
podReqs, podLimits, err := pod.PodRequestsAndLimits(&p)
173+
podReqs, podLimits, err := pod.RequestsAndLimits(&p)
174174
if err != nil {
175175
return NodeAllocatedResources{}, err
176176
}

modules/api/pkg/resource/pod/common.go

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package pod
1717
import (
1818
"fmt"
1919

20+
"github.com/samber/lo"
2021
v1 "k8s.io/api/core/v1"
2122

2223
metricapi "k8s.io/dashboard/api/pkg/integration/metric/api"
@@ -289,11 +290,11 @@ func maxResourceList(list, new v1.ResourceList) {
289290
}
290291
}
291292

292-
// PodRequestsAndLimits returns a dictionary of all defined resources summed up for all
293+
// RequestsAndLimits returns a dictionary of all defined resources summed up for all
293294
// containers of the pod. If pod overhead is non-nil, the pod overhead is added to the
294295
// total container resource requests and to the total container limits which have a
295296
// non-zero quantity.
296-
func PodRequestsAndLimits(pod *v1.Pod) (reqs, limits v1.ResourceList, err error) {
297+
func RequestsAndLimits(pod *v1.Pod) (reqs, limits v1.ResourceList, err error) {
297298
reqs, limits = v1.ResourceList{}, v1.ResourceList{}
298299
for _, container := range pod.Spec.Containers {
299300
addResourceList(reqs, container.Resources.Requests)
@@ -320,39 +321,49 @@ func PodRequestsAndLimits(pod *v1.Pod) (reqs, limits v1.ResourceList, err error)
320321
}
321322

322323
func getPodAllocatedResources(pod *v1.Pod) (PodAllocatedResources, error) {
323-
reqs, limits, err := PodRequestsAndLimits(pod)
324+
reqs, limits, err := RequestsAndLimits(pod)
324325
if err != nil {
325326
return PodAllocatedResources{}, err
326327
}
327328

328-
for podReqName, podReqValue := range reqs {
329-
if value, ok := reqs[podReqName]; !ok {
330-
reqs[podReqName] = podReqValue.DeepCopy()
331-
} else {
332-
value.Add(podReqValue)
333-
reqs[podReqName] = value
334-
}
335-
}
336-
337-
for podLimitName, podLimitValue := range limits {
338-
if value, ok := limits[podLimitName]; !ok {
339-
limits[podLimitName] = podLimitValue.DeepCopy()
340-
} else {
341-
value.Add(podLimitValue)
342-
limits[podLimitName] = value
343-
}
344-
}
345-
346-
cpuRequests, cpuLimits, memoryRequests, memoryLimits := reqs[v1.ResourceCPU], limits[v1.ResourceCPU], reqs[v1.ResourceMemory], limits[v1.ResourceMemory]
329+
cpuRequests := reqs[v1.ResourceCPU]
330+
cpuLimits := limits[v1.ResourceCPU]
331+
memoryRequests := reqs[v1.ResourceMemory]
332+
memoryLimits := limits[v1.ResourceMemory]
347333

348334
return PodAllocatedResources{
349335
CPURequests: cpuRequests.MilliValue(),
350336
CPULimits: cpuLimits.MilliValue(),
351337
MemoryRequests: memoryRequests.Value(),
352338
MemoryLimits: memoryLimits.Value(),
339+
GPURequests: toGPUAllocations(reqs),
340+
GPULimits: toGPUAllocations(limits),
353341
}, nil
354342
}
355343

344+
func toGPUAllocations(resources v1.ResourceList) []GPUAllocation {
345+
result := make([]GPUAllocation, 0)
346+
for resource, quantity := range resources {
347+
if gpuType := ToGPU(string(resource)); gpuType != NoGPU {
348+
result = append(result, GPUAllocation{
349+
Quantity: quantity.Value(),
350+
Type: gpuType,
351+
})
352+
}
353+
}
354+
355+
return lo.Reduce(result, func(acc []GPUAllocation, item GPUAllocation, _ int) []GPUAllocation {
356+
for i, existing := range acc {
357+
if existing.Type == item.Type {
358+
acc[i].Quantity += item.Quantity
359+
return acc
360+
}
361+
}
362+
363+
return append(acc, item)
364+
}, []GPUAllocation{})
365+
}
366+
356367
func isPodInitializedConditionTrue(status *v1.PodStatus) bool {
357368
for _, condition := range status.Conditions {
358369
if condition.Type != v1.PodInitialized {

modules/api/pkg/resource/pod/common_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@ func TestToPodPodStatusFailed(t *testing.T) {
4444
Status: string(v1.PodFailed),
4545
Warnings: []common.Event{},
4646
ContainerStatuses: make([]ContainerStatus, 0),
47+
AllocatedResources: PodAllocatedResources{
48+
GPURequests: []GPUAllocation{},
49+
GPULimits: []GPUAllocation{},
50+
},
4751
}
4852

4953
actual := toPod(pod, &MetricsByPod{}, []common.Event{})
@@ -72,6 +76,10 @@ func TestToPodPodStatusSucceeded(t *testing.T) {
7276
Status: string(v1.PodSucceeded),
7377
Warnings: []common.Event{},
7478
ContainerStatuses: make([]ContainerStatus, 0),
79+
AllocatedResources: PodAllocatedResources{
80+
GPURequests: []GPUAllocation{},
81+
GPULimits: []GPUAllocation{},
82+
},
7583
}
7684

7785
actual := toPod(pod, &MetricsByPod{}, []common.Event{})
@@ -104,6 +112,10 @@ func TestToPodPodStatusRunning(t *testing.T) {
104112
Status: string(v1.PodRunning),
105113
Warnings: []common.Event{},
106114
ContainerStatuses: make([]ContainerStatus, 0),
115+
AllocatedResources: PodAllocatedResources{
116+
GPURequests: []GPUAllocation{},
117+
GPULimits: []GPUAllocation{},
118+
},
107119
}
108120

109121
actual := toPod(pod, &MetricsByPod{}, []common.Event{})
@@ -132,6 +144,10 @@ func TestToPodPodStatusPending(t *testing.T) {
132144
Status: string(v1.PodPending),
133145
Warnings: []common.Event{},
134146
ContainerStatuses: make([]ContainerStatus, 0),
147+
AllocatedResources: PodAllocatedResources{
148+
GPURequests: []GPUAllocation{},
149+
GPULimits: []GPUAllocation{},
150+
},
135151
}
136152

137153
actual := toPod(pod, &MetricsByPod{}, []common.Event{})
@@ -177,6 +193,10 @@ func TestToPodContainerStates(t *testing.T) {
177193
State: Waiting,
178194
},
179195
},
196+
AllocatedResources: PodAllocatedResources{
197+
GPURequests: []GPUAllocation{},
198+
GPULimits: []GPUAllocation{},
199+
},
180200
}
181201

182202
actual := toPod(pod, &MetricsByPod{}, []common.Event{})
@@ -199,6 +219,10 @@ func TestToPod(t *testing.T) {
199219
TypeMeta: types.TypeMeta{Kind: types.ResourceKindPod},
200220
Warnings: []common.Event{},
201221
ContainerStatuses: make([]ContainerStatus, 0),
222+
AllocatedResources: PodAllocatedResources{
223+
GPURequests: []GPUAllocation{},
224+
GPULimits: []GPUAllocation{},
225+
},
202226
},
203227
}, {
204228
pod: &v1.Pod{
@@ -214,6 +238,10 @@ func TestToPod(t *testing.T) {
214238
},
215239
Warnings: []common.Event{},
216240
ContainerStatuses: make([]ContainerStatus, 0),
241+
AllocatedResources: PodAllocatedResources{
242+
GPURequests: []GPUAllocation{},
243+
GPULimits: []GPUAllocation{},
244+
},
217245
},
218246
},
219247
}

modules/api/pkg/resource/pod/list.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
package pod
1616

1717
import (
18+
"strings"
19+
1820
v1 "k8s.io/api/core/v1"
1921
metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2022
k8sClient "k8s.io/client-go/kubernetes"
@@ -91,6 +93,55 @@ type PodAllocatedResources struct {
9193

9294
// MemoryLimits is defined memory limit.
9395
MemoryLimits int64 `json:"memoryLimits"`
96+
97+
// GPURequests lists the quantity of requested GPUs, one entry per GPU type.
98+
GPURequests []GPUAllocation `json:"gpuRequests"`
99+
100+
// GPULimits lists the GPU limits, one entry per GPU type.
101+
GPULimits []GPUAllocation `json:"gpuLimits"`
102+
}
103+
104+
// GPU names the vendor family that a GPU resource name is classified into.
type GPU string

// GPU values returned by ToGPU.
const (
	// NoGPU is returned for a resource name that is not recognized as a GPU.
	NoGPU GPU = ""
	// UnknownGPU is returned for a name that contains "gpu" but matches no known vendor.
	UnknownGPU GPU = "unknown"
	// NvidiaGPU is returned for the NvidiaLabel resource name.
	NvidiaGPU GPU = "nvidia"
	// AMDGPU is returned for the AMDLabel resource name.
	AMDGPU GPU = "amd"
	// IntelGPU is returned for any resource name starting with IntelLabel.
	IntelGPU GPU = "intel"
)

// Resource names that ToGPU uses to recognize a vendor.
const (
	// NvidiaLabel must match the resource name exactly.
	NvidiaLabel = "nvidia.com/gpu"
	// AMDLabel must match the resource name exactly.
	AMDLabel = "amd.com/gpu"
	// IntelLabel is for a partial match only: a resource name is treated as
	// Intel when it starts with this prefix.
	IntelLabel = "gpu.intel.com"
)

// ToGPU classifies a resource name into a GPU type.
//
// The checks run in this order: exact match on NvidiaLabel, exact match on
// AMDLabel, prefix match on IntelLabel, then a substring match on "gpu" which
// yields UnknownGPU. Anything else yields NoGPU.
func ToGPU(gpuType string) GPU {
	switch {
	case gpuType == NvidiaLabel:
		return NvidiaGPU
	case gpuType == AMDLabel:
		return AMDGPU
	case strings.HasPrefix(gpuType, IntelLabel):
		return IntelGPU
	case strings.Contains(gpuType, "gpu"):
		return UnknownGPU
	default:
		return NoGPU
	}
}
137+
138+
// GPUAllocation pairs a GPU type with the quantity of that GPU type found in
// a pod's resource requests or limits.
139+
type GPUAllocation struct {
140+
// Quantity is the number of GPUs of this type.
141+
Quantity int64 `json:"quantity"`
142+
143+
// Type is the GPU type the quantity applies to.
144+
Type GPU `json:"type"`
94145
}
95146

96147
var EmptyPodList = &PodList{

modules/api/pkg/resource/pod/list_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@ func TestGetPodListFromChannels(t *testing.T) {
105105
TypeMeta: types.TypeMeta{Kind: types.ResourceKindPod},
106106
Warnings: []common.Event{},
107107
ContainerStatuses: make([]pod.ContainerStatus, 0),
108+
AllocatedResources: pod.PodAllocatedResources{
109+
GPURequests: []pod.GPUAllocation{},
110+
GPULimits: []pod.GPUAllocation{},
111+
},
108112
}},
109113
Errors: []error{},
110114
},
@@ -139,3 +143,23 @@ func TestGetPodListFromChannels(t *testing.T) {
139143
}
140144
}
141145
}
146+
147+
func TestToGPU(t *testing.T) {
148+
cases := []struct {
149+
in string
150+
out pod.GPU
151+
}{
152+
{"nvidia.com/gpu", pod.NvidiaGPU},
153+
{"amd.com/gpu", pod.AMDGPU},
154+
{"gpu.intel.com/xe", pod.IntelGPU},
155+
{"gpu.intel.com/iris", pod.IntelGPU},
156+
{"unknown.gpu.type", pod.UnknownGPU},
157+
{"", pod.NoGPU},
158+
}
159+
160+
for _, c := range cases {
161+
if gpuType := pod.ToGPU(c.in); gpuType != c.out {
162+
t.Errorf("ToGPU(%q) == %q, expected %q", c.in, gpuType, c.out)
163+
}
164+
}
165+
}

modules/api/pkg/resource/service/pods_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,14 +61,18 @@ func TestGetServicePods(t *testing.T) {
6161
TypeMeta: types.TypeMeta{Kind: types.ResourceKindPod},
6262
Warnings: []common.Event{},
6363
ContainerStatuses: make([]pod.ContainerStatus, 0),
64+
AllocatedResources: pod.PodAllocatedResources{
65+
GPURequests: []pod.GPUAllocation{},
66+
GPULimits: []pod.GPUAllocation{},
67+
},
6468
},
6569
},
6670
Errors: []error{},
6771
},
6872
},
6973
}
7074
for _, c := range cases {
71-
fakeClient := fake.NewSimpleClientset(c.service, c.podList)
75+
fakeClient := fake.NewClientset(c.service, c.podList)
7276

7377
actual, _ := GetServicePods(fakeClient, nil, c.namespace, c.name, dataselect.NoDataSelect)
7478
if !reflect.DeepEqual(actual, c.expected) {

0 commit comments

Comments
 (0)