@@ -310,7 +310,7 @@ The mapping of DRA devices and extended resources is stored in k8s data store
310
310
application that uses the devices.
311
311
312
312
```go
313
- // DeviceClassSpec is used in a [ DeviceClass] to define what can be allocated
313
+ // DeviceClassSpec is used in a DeviceClass to define what can be allocated
314
314
// and how to configure it.
315
315
type DeviceClassSpec struct {
316
316
// ExtendedResourceName defines a mapping to the extended resource API.
@@ -379,7 +379,7 @@ type DeviceRequest struct {
379
379
// Must be a DNS label.
380
380
//
381
381
// +required
382
- Name string ` json:"name" protobuf:"bytes,1,name=name"`
382
+ Name string
383
383
}
384
384
```
385
385
@@ -408,28 +408,28 @@ to the containers in the pod.
408
408
// resource requests backed by DRA. It stores the generated name for
409
409
// the corresponding special ResourceClaim created by scheduler.
410
410
type PodExtendedResourceClaimStatus struct {
411
- // Names identifies the mapping of <container, extended resource backed by DRA> to device request.
411
+ // ResourceClaimName is the name of the ResourceClaim that was
412
+ // generated for the Pod in the namespace of the Pod.
413
+ ResourceClaimName string
414
+
415
+ // RequestMapping identifies the mapping of <container, extended resource backed by DRA> to device request.
412
416
// +patchMergeKey=requestName
413
417
// +patchStrategy=merge,retainKeys
414
418
// +listType=atomic
415
419
// +listMapKey=requestName
416
420
// +featureGate=DynamicResourceAllocation
417
- Names []ContainerExtendedResourceRequest ` json:"names" patchStrategy:"merge,retainKeys" patchMergeKey:"requestName" protobuf:"bytes,1,rep,name=names"`
418
-
419
- // ResourceClaimName is the name of the ResourceClaim that was
420
- // generated for the Pod in the namespace of the Pod.
421
- ResourceClaimName string ` json:"resourceClaimName" protobuf:"bytes,2,name=resourceClaimName"`
421
+ RequestMapping []ContainerExtendedResourceRequest
422
422
}
423
423
424
424
type ContainerExtendedResourceRequest struct {
425
425
// ContainerName is the unique container name within the pod.
426
- ContainerName string ` json:"containerName" protobuf:"bytes,1,name=containerName" `
426
+ ContainerName string
427
427
// ExtendedResourceName is the extended resource name backed by DRA inside
428
428
// the container's requests.
429
- ExtendedResourceName string ` json:"extendedResourceName" protobuf:"bytes,2,name=extendedResourceName" `
429
+ ExtendedResourceName string
430
430
// RequestName is the device request name in the special resource claim
431
431
// created for extended resource requests backed by DRA.
432
- RequestName string ` json:"requestName" protobuf:"bytes,3,name=requestName" `
432
+ RequestName string
433
433
}
434
434
435
435
type PodStatus struct {
@@ -449,11 +449,11 @@ then the pod's status is like below:
449
449
```yaml
450
450
status:
451
451
extendedResourceClaimStatus:
452
- names :
452
+ resourceClaimName: ccc-gpu-57999b9c4c-vpq68-gpu-8s27z
453
+ requestMapping:
453
454
- containerName: container-name
454
455
extendedResourceName: foo.domain/bar
455
456
requestName: container-0-request-2
456
- resourceClaimName : ccc-gpu-57999b9c4c-vpq68-gpu-8s27z
457
457
```
458
458
where the `deviceRequest` name is "container-0-request-2", `container-name` is the first container
459
459
in the pod, and `foo.domain/bar` is the 3rd extended resource in the container's requests.
@@ -462,7 +462,7 @@ Note the validations for extendedResourceClaimStatus are different from the
462
462
validations for resourceClaimStatuses.
463
463
464
464
1. resourceClaimStatuses requires that `name` be a DNS label,
465
- extendedResourceClaimStatus's names' `containerName` and `RequestName` must
465
+ extendedResourceClaimStatus's requestMapping's `containerName` and `requestName` must
466
466
be a DNS label, while the `extendedResourceName` is not a DNS label.
467
467
1. resourceClaimStatuses requires `name` must be one of the claim's name in the
468
468
pod spec. extendedResourceClaimStatus requires `containerName` must be one
@@ -963,7 +963,11 @@ For each of them, fill in the following information by copying the below templat
963
963
Not required until feature graduated to beta.
964
964
- Testing: Are there any tests for failure mode? If not, describe why.
965
965
-->
966
- Will be considered for beta.
966
+ - [Pod pending because an extended resource request backed by DRA asks for no less than 128 devices]
967
+ - Detection: inspect the pod status for 'Pending'.
968
+ - Mitigations: reduce the number of devices requested by a single extended resource request backed by DRA.
969
+ - Diagnostics: scheduler logs at level 5 show the reason for the scheduling failure.
970
+ - Testing: Will be considered for beta.
967
971
968
972
###### What steps should be taken if SLOs are not being met to determine the problem?
969
973
0 commit comments