
Commit 5f7330c

Rename request to limits in Flavor

Signed-off-by: kerthcet <[email protected]>

1 parent 4634e23 · commit 5f7330c

File tree

17 files changed: +74, -46 lines changed

api/core/v1alpha1/model_types.go (+9, -5)

```diff
@@ -101,16 +101,16 @@ type FlavorName string
 type Flavor struct {
 	// Name represents the flavor name, which will be used in model claim.
 	Name FlavorName `json:"name"`
-	// Requests defines the required accelerators to serve the model for each replica,
-	// like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
+	// Limits defines the required accelerators to serve the model for each replica,
+	// like <nvidia.com/gpu: 8>. For multi-hosts cases, the limits here indicates
 	// the resource requirements for each replica, usually equals to the TP size.
 	// Not recommended to set the cpu and memory usage here:
 	// - if using playground, you can define the cpu/mem usage at backendConfig.
 	// - if using inference service, you can define the cpu/mem at the container resources.
-	// However, if you define the same accelerator requests at playground/service as well,
-	// the requests will be overwritten by the flavor requests.
+	// However, if you define the same accelerator resources at playground/service as well,
+	// the resources will be overwritten by the flavor limit here.
 	// +optional
-	Requests v1.ResourceList `json:"requests,omitempty"`
+	Limits v1.ResourceList `json:"limits,omitempty"`
 	// NodeSelector represents the node candidates for Pod placements, if a node doesn't
 	// meet the nodeSelector, it will be filtered out in the resourceFungibility scheduler plugin.
 	// If nodeSelector is empty, it means every node is a candidate.
@@ -129,11 +129,15 @@ type Flavor struct {
 type InferenceConfig struct {
 	// Flavors represents the accelerator requirements to serve the model.
 	// Flavors are fungible following the priority represented by the slice order.
+	// This is used both in Playground and Inference Service.
 	// +kubebuilder:validation:MaxItems=8
 	// +optional
 	Flavors []Flavor `json:"flavors,omitempty"`
 	// SharedMemorySize represents the size of /dev/shm required in the runtime of
 	// inference workload.
+	// This is only used in Playground. Inference Service can configure the shared memory
+	// directly in PodSpec.
+	// +optional
 	SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
 }
 
```
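For orientation, a minimal sketch of populating the renamed field from Go. The module import path and the map[string]string type of Params are assumptions inferred from the other hunks in this commit, not confirmed by it:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	coreapi "github.com/inftyai/llmaz/api/core/v1alpha1" // import path assumed
)

func main() {
	// One flavor asking for 8 GPUs per replica. Limits replaces the old
	// Requests field; TP/PP follow the preset parameters named in the CRD.
	flavor := coreapi.Flavor{
		Name: coreapi.FlavorName("a100-80gb"),
		Limits: v1.ResourceList{
			v1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
		},
		Params: map[string]string{"TP": "8", "PP": "2"}, // Params type assumed
	}
	fmt.Println(flavor.Name, flavor.Limits)
}
```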

api/core/v1alpha1/zz_generated.deepcopy.go (+7, -2)

(Generated file; diff not rendered.)

client-go/applyconfiguration/core/v1alpha1/flavor.go (+5, -5)

(Generated file; diff not rendered.)

client-go/applyconfiguration/core/v1alpha1/inferenceconfig.go (+14, -1)

(Generated file; diff not rendered.)

config/crd/bases/llmaz.io_openmodels.yaml (+20, -17)

```diff
@@ -54,13 +54,31 @@ spec:
               description: |-
                 Flavors represents the accelerator requirements to serve the model.
                 Flavors are fungible following the priority represented by the slice order.
+                This is used both in Playground and Inference Service.
               items:
                 description: |-
                   Flavor defines the accelerator requirements for a model and the necessary parameters
                   in autoscaling. Right now, it will be used in two places:
                   - Pod scheduling with node selectors specified.
                   - Cluster autoscaling with essential parameters provided.
                 properties:
+                  limits:
+                    additionalProperties:
+                      anyOf:
+                      - type: integer
+                      - type: string
+                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                      x-kubernetes-int-or-string: true
+                    description: |-
+                      Limits defines the required accelerators to serve the model for each replica,
+                      like <nvidia.com/gpu: 8>. For multi-hosts cases, the limits here indicates
+                      the resource requirements for each replica, usually equals to the TP size.
+                      Not recommended to set the cpu and memory usage here:
+                      - if using playground, you can define the cpu/mem usage at backendConfig.
+                      - if using inference service, you can define the cpu/mem at the container resources.
+                      However, if you define the same accelerator resources at playground/service as well,
+                      the resources will be overwritten by the flavor limit here.
+                    type: object
                   name:
                     description: Name represents the flavor name, which will
                       be used in model claim.
@@ -83,23 +101,6 @@ spec:
                       with <INSTANCE-TYPE: p4d.24xlarge> for AWS.
                       Preset parameters: TP, PP, INSTANCE-TYPE.
                     type: object
-                  requests:
-                    additionalProperties:
-                      anyOf:
-                      - type: integer
-                      - type: string
-                      pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                      x-kubernetes-int-or-string: true
-                    description: |-
-                      Requests defines the required accelerators to serve the model for each replica,
-                      like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
-                      the resource requirements for each replica, usually equals to the TP size.
-                      Not recommended to set the cpu and memory usage here:
-                      - if using playground, you can define the cpu/mem usage at backendConfig.
-                      - if using inference service, you can define the cpu/mem at the container resources.
-                      However, if you define the same accelerator requests at playground/service as well,
-                      the requests will be overwritten by the flavor requests.
-                    type: object
                 required:
                 - name
                 type: object
@@ -112,6 +113,8 @@ spec:
             description: |-
               SharedMemorySize represents the size of /dev/shm required in the runtime of
               inference workload.
+              This is only used in Playground. Inference Service can configure the shared memory
+              directly in PodSpec.
            pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
            x-kubernetes-int-or-string: true
          type: object
```
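The pattern above is the standard Kubernetes quantity grammar. As a quick illustration, both a bare count and a suffixed size, the two shapes the x-kubernetes-int-or-string schema admits, parse into the same apimachinery Quantity type (a minimal, self-contained sketch):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// "8" (a bare count, e.g. nvidia.com/gpu) and "2Gi" (e.g. a
	// sharedMemorySize) both satisfy the quantity pattern in the CRD.
	gpus := resource.MustParse("8")
	shm := resource.MustParse("2Gi")
	fmt.Println(gpus.Value(), shm.String()) // 8 2Gi
}
```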

docs/examples/hostpath/model.yaml (+1, -1)

```diff
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
```

docs/examples/huggingface/model.yaml (+1, -1)

```diff
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
```

docs/examples/modelscope/model.yaml (+1, -1)

```diff
@@ -11,5 +11,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
```

docs/examples/multi-nodes/model.yaml (+2, -2)

```diff
@@ -10,13 +10,13 @@ spec:
   inferenceConfig:
     flavors:
       - name: a100-80gb
-        requests:
+        limits:
           nvidia.com/gpu: 8 # single node request
         params:
           TP: "8" # 8 GPUs per node, equal to nvidia.com/gpu
           PP: "2" # 2 nodes
       # - name: h100
-      #   requests:
+      #   limits:
       #     nvidia.com/gpu: 8 # single node request
       #   params:
       #     TP: "8"
```
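Taken together, TP=8 and PP=2 mean each replica spans two nodes of eight GPUs each (sixteen accelerators per replica in total), while limits describes only a single node's share.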

docs/examples/objstore-oss/model.yaml (+1, -1)

```diff
@@ -11,5 +11,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
```

docs/examples/sglang/model.yaml (+1, -1)

```diff
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
```

docs/examples/speculative-decoding/vllm/model.yaml (+1, -1)

```diff
@@ -10,7 +10,7 @@ spec:
   inferenceConfig:
     flavors:
       - name: a10 # gpu type
-        requests:
+        limits:
           nvidia.com/gpu: 1
 ---
 apiVersion: llmaz.io/v1alpha1
```

docs/examples/tgi/model.yaml (+1, -1)

```diff
@@ -10,5 +10,5 @@ spec:
   inferenceConfig:
     flavors:
       - name: t4 # GPU type
-        requests:
+        limits:
           nvidia.com/gpu: 1
```

pkg/controller/inference/service_controller.go (+2, -2)

```diff
@@ -201,8 +201,8 @@ func injectModelFlavor(template *corev1.PodTemplateSpec, model *coreapi.OpenMode
 
 	for i, flavor := range model.Spec.InferenceConfig.Flavors {
 		if flavor.Name == flavorName {
-			requests := model.Spec.InferenceConfig.Flavors[i].Requests
-			for k, v := range requests {
+			limits := model.Spec.InferenceConfig.Flavors[i].Limits
+			for k, v := range limits {
 				if container.Resources.Requests == nil {
 					container.Resources.Requests = map[corev1.ResourceName]resource.Quantity{}
 				}
```
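Read in isolation, the loop copies every accelerator quantity from the flavor's Limits into the container's resource requests. A hypothetical standalone rendering of just that loop (the enclosing controller plumbing is omitted, and whether values are also mirrored into container limits is not visible in this hunk):

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// injectFlavorLimits is a standalone stand-in for the loop above: each
// quantity in the flavor's Limits list lands in the container's requests.
func injectFlavorLimits(container *corev1.Container, limits corev1.ResourceList) {
	for k, v := range limits {
		if container.Resources.Requests == nil {
			container.Resources.Requests = map[corev1.ResourceName]resource.Quantity{}
		}
		container.Resources.Requests[k] = v
	}
}

func main() {
	c := &corev1.Container{Name: "inference"}
	injectFlavorLimits(c, corev1.ResourceList{
		corev1.ResourceName("nvidia.com/gpu"): resource.MustParse("8"),
	})
	fmt.Println(c.Resources.Requests) // map[nvidia.com/gpu:8]
}
```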

pkg/controller_helper/helper.go (+3)

```diff
@@ -121,6 +121,9 @@ func FirstAssignedFlavor(model *coreapi.OpenModel, playground *inferenceapi.Play
 // the second one is whether this is a multi-host inference.
 func MultiHostInference(model *coreapi.OpenModel, playground *inferenceapi.Playground) (int32, bool) {
 	flavors := FirstAssignedFlavor(model, playground)
+	// This is not valid for all cases, like SGLang uses TP for model parallelism.
+	// However, this is not a recommend way since TP requires more communication than PP.
+	// It's ok to support PP only at this moment.
 	if len(flavors) > 0 && flavors[0].Params["PP"] != "" {
 		size, err := strconv.Atoi(flavors[0].Params["PP"])
 		if err != nil {
```
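A distilled sketch of the PP-based sizing decision. Only the PP lookup and the Atoi call are visible in the hunk, so the fallback on a bad or missing value is an assumption:

```go
package main

import (
	"fmt"
	"strconv"
)

// multiHostSize mirrors the visible logic: a flavor whose PP parameter
// parses to n implies an n-host replica; anything else is single-host.
func multiHostSize(params map[string]string) (int32, bool) {
	pp := params["PP"]
	if pp == "" {
		return 1, false
	}
	size, err := strconv.Atoi(pp)
	if err != nil || size <= 1 {
		return 1, false // assumed fallback; the hunk cuts off at the error branch
	}
	return int32(size), true
}

func main() {
	fmt.Println(multiHostSize(map[string]string{"TP": "8", "PP": "2"})) // 2 true
}
```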

test/util/validation/validate_service.go (+2, -2)

```diff
@@ -174,9 +174,9 @@ func ValidateModelFlavor(service *inferenceapi.Service, model *coreapi.OpenModel
 
 	for _, flavor := range model.Spec.InferenceConfig.Flavors {
 		if flavor.Name == flavorName {
-			requests := flavor.Requests
+			limits := flavor.Limits
 			container := workload.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0]
-			for k, v := range requests {
+			for k, v := range limits {
 				if !container.Resources.Requests[k].Equal(v) {
 					return fmt.Errorf("unexpected request value %v, got %v", v, workload.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Resources.Requests[k])
 				}
```

test/util/wrapper/model.go (+3, -3)

```diff
@@ -133,10 +133,10 @@ func (w *FlavorWrapper) Obj() *coreapi.Flavor {
 }
 
 func (w *FlavorWrapper) SetRequest(r, v string) *FlavorWrapper {
-	if w.Requests == nil {
-		w.Requests = map[v1.ResourceName]resource.Quantity{}
+	if w.Limits == nil {
+		w.Limits = map[v1.ResourceName]resource.Quantity{}
 	}
-	w.Requests[v1.ResourceName(r)] = resource.MustParse(v)
+	w.Limits[v1.ResourceName(r)] = resource.MustParse(v)
 	return w
 }
 
```
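To see the effect of the rename on the test helper, a self-contained stand-in for the wrapper (local types, not the repository's), showing that the setter keeps its SetRequest name while now filling Limits:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// flavorWrapper is a local stand-in for the test wrapper above; only the
// Limits field matters here.
type flavorWrapper struct {
	Limits v1.ResourceList
}

// SetRequest keeps its historical name but now fills Limits, matching the
// hunk above, so existing test call sites stay source-compatible.
func (w *flavorWrapper) SetRequest(r, v string) *flavorWrapper {
	if w.Limits == nil {
		w.Limits = map[v1.ResourceName]resource.Quantity{}
	}
	w.Limits[v1.ResourceName(r)] = resource.MustParse(v)
	return w
}

func main() {
	w := (&flavorWrapper{}).SetRequest("nvidia.com/gpu", "1")
	fmt.Println(w.Limits) // map[nvidia.com/gpu:1]
}
```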
