Skip to content

Commit f4541b0

Browse files
Merge pull request #276 from kerthcet/document/api-reference
Add recommendedConfigs to backendRuntime
2 parents d5ec014 + a719916 commit f4541b0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+1324
-1410
lines changed

api/inference/v1alpha1/backendruntime_types.go

+27-34
Original file line numberDiff line numberDiff line change
@@ -19,22 +19,10 @@ package v1alpha1
1919
import (
2020
autoscalingv2 "k8s.io/api/autoscaling/v2"
2121
corev1 "k8s.io/api/core/v1"
22+
"k8s.io/apimachinery/pkg/api/resource"
2223
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2324
)
2425

25-
// BackendRuntimeArg is the preset arguments for easy to use.
26-
// Three preset names are provided: default, speculative-decoding, model-parallelism,
27-
// do not change the name.
28-
type BackendRuntimeArg struct {
29-
// Name represents the identifier of the backendRuntime argument.
30-
// +kubebuilder:default=default
31-
// +optional
32-
Name *string `json:"name,omitempty"`
33-
// Flags represents all the preset configurations.
34-
// Flag around with {{ .CONFIG }} is a configuration waiting for render.
35-
Flags []string `json:"flags,omitempty"`
36-
}
37-
3826
// HPATrigger represents the configuration of the HorizontalPodAutoscaler.
3927
// Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec.
4028
// Note: the HPA component should be installed in advance.
@@ -55,17 +43,6 @@ type HPATrigger struct {
5543
Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
5644
}
5745

58-
// NamedScaleTrigger defines the rules to scale the workloads.
59-
// Only one trigger cloud work at a time. The name is used to identify
60-
// the trigger in backendRuntime.
61-
type NamedScaleTrigger struct {
62-
// Name represents the identifier of the scale trigger, e.g. some triggers defined for
63-
// latency sensitive workloads, some are defined for throughput sensitive workloads.
64-
Name string `json:"name,omitempty"`
65-
// HPA represents the trigger configuration of the HorizontalPodAutoscaler.
66-
HPA *HPATrigger `json:"hpa,omitempty"`
67-
}
68-
6946
// ScaleTrigger defines the rules to scale the workloads.
7047
// Only one trigger could work at a time, mostly used in Playground.
7148
type ScaleTrigger struct {
@@ -83,6 +60,30 @@ type MultiHostCommands struct {
8360
Worker []string `json:"worker,omitempty"`
8461
}
8562

63+
// RecommendedConfig represents the recommended configurations for the backendRuntime,
64+
// user can choose one of them to apply.
65+
type RecommendedConfig struct {
66+
// Name represents the identifier of the config.
67+
Name string `json:"name"`
68+
// Args represents all the arguments for the command.
69+
// An argument wrapped in {{ .CONFIG }} is a configuration waiting to be rendered.
70+
// +optional
71+
Args []string `json:"args,omitempty"`
72+
// Resources represents the resource requirements for backend, like cpu/mem,
73+
// accelerators like GPU should not be defined here, but at the model flavors,
74+
// or the values here will be overwritten.
75+
// +optional
76+
Resources *ResourceRequirements `json:"resources,omitempty"`
77+
// SharedMemorySize represents the size of /dev/shm required in the runtime of
78+
// inference workload.
79+
// +optional
80+
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
81+
// ScaleTrigger defines the rules to scale the workloads.
82+
// Only one trigger could work at a time.
83+
// +optional
84+
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
85+
}
86+
8687
// BackendRuntimeSpec defines the desired state of BackendRuntime
8788
type BackendRuntimeSpec struct {
8889
// Commands represents the default commands for the backendRuntime.
@@ -98,16 +99,9 @@ type BackendRuntimeSpec struct {
9899
// Version represents the default version of the backendRuntime.
99100
// It will be appended to the image as a tag.
100101
Version string `json:"version"`
101-
// Args represents the preset arguments of the backendRuntime.
102-
// They can be appended or overwritten by the Playground backendRuntimeConfig.
103-
Args []BackendRuntimeArg `json:"args,omitempty"`
104102
// Envs represents the environments set to the container.
105103
// +optional
106104
Envs []corev1.EnvVar `json:"envs,omitempty"`
107-
// Resources represents the resource requirements for backendRuntime, like cpu/mem,
108-
// accelerators like GPU should not be defined here, but at the model flavors,
109-
// or the values here will be overwritten.
110-
Resources ResourceRequirements `json:"resources"`
111105
// Periodic probe of backend liveness.
112106
// Backend will be restarted if the probe fails.
113107
// Cannot be updated.
@@ -124,10 +118,9 @@ type BackendRuntimeSpec struct {
124118
// when it might take a long time to load data or warm a cache, than during steady-state operation.
125119
// +optional
126120
StartupProbe *corev1.Probe `json:"startupProbe,omitempty"`
127-
// ScaleTriggers represents a set of triggers preset to be used by Playground.
128-
// If Playground not specify the scale trigger, the 0-index trigger will be used.
121+
// RecommendedConfigs represents the recommended configurations for the backendRuntime.
129122
// +optional
130-
ScaleTriggers []NamedScaleTrigger `json:"scaleTriggers,omitempty"`
123+
RecommendedConfigs []RecommendedConfig `json:"recommendedConfigs,omitempty"`
131124
}
132125

133126
// BackendRuntimeStatus defines the observed state of BackendRuntime

api/inference/v1alpha1/config_types.go

+20-36
Original file line numberDiff line numberDiff line change
@@ -28,29 +28,43 @@ const (
2828
)
2929

3030
type BackendRuntimeConfig struct {
31-
// Name represents the inference backend under the hood, e.g. vLLM.
31+
// BackendName represents the inference backend under the hood, e.g. vLLM.
3232
// +kubebuilder:default=vllm
3333
// +optional
34-
Name *BackendName `json:"name,omitempty"`
34+
BackendName *BackendName `json:"backendName,omitempty"`
3535
// Version represents the backend version if you want a different one
3636
// from the default version.
3737
// +optional
3838
Version *string `json:"version,omitempty"`
3939
// Envs represents the environments set to the container.
4040
// +optional
4141
Envs []corev1.EnvVar `json:"envs,omitempty"`
42-
42+
// ConfigName represents the recommended configuration name for the backend.
43+
// It will be inferred from the models in the runtime if not specified, e.g. default,
44+
// speculative-decoding or model-parallelism.
45+
ConfigName *string `json:"configName,omitempty"`
46+
// Args represents all the arguments for the command.
47+
// An argument wrapped in {{ .CONFIG }} is a configuration waiting to be rendered.
48+
// +optional
49+
// Args defined here will "append" the args in the recommendedConfig.
50+
// +optional
51+
Args []string `json:"args,omitempty"`
4352
// Resources represents the resource requirements for backend, like cpu/mem,
4453
// accelerators like GPU should not be defined here, but at the model flavors,
4554
// or the values here will be overwritten.
55+
// Resources defined here will "overwrite" the resources in the recommendedConfig.
56+
// +optional
4657
Resources *ResourceRequirements `json:"resources,omitempty"`
4758
// SharedMemorySize represents the size of /dev/shm required in the runtime of
4859
// inference workload.
60+
// SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
4961
// +optional
5062
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
51-
// Args represents the specified arguments of the backendRuntime,
52-
// will be append to the backendRuntime.spec.Args.
53-
Args *BackendRuntimeArg `json:"args,omitempty"`
63+
// ScaleTrigger defines the rules to scale the workloads.
64+
// Only one trigger could work at a time, mostly used in Playground.
65+
// ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
66+
// +optional
67+
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
5468
}
5569

5670
// TODO: Do not support DRA yet, we can support that once needed.
@@ -66,33 +80,3 @@ type ResourceRequirements struct {
6680
// +optional
6781
Requests corev1.ResourceList `json:"requests,omitempty"`
6882
}
69-
70-
// ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime.
71-
type ScaleTriggerRef struct {
72-
// Name represents the scale trigger name defined in the backendRuntime.scaleTriggers.
73-
Name string `json:"name"`
74-
}
75-
76-
type ElasticConfig struct {
77-
// MinReplicas indicates the minimum number of inference workloads based on the traffic.
78-
// Default to 1.
79-
// MinReplicas couldn't be 0 now, will support serverless in the future.
80-
// +kubebuilder:default=1
81-
// +optional
82-
MinReplicas *int32 `json:"minReplicas,omitempty"`
83-
// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
84-
// Default to nil means there's no limit for the instance number.
85-
// +optional
86-
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
87-
// ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime
88-
// with tuned target value.
89-
// ScaleTriggerRef and ScaleTrigger can't be set at the same time.
90-
// +optional
91-
ScaleTriggerRef *ScaleTriggerRef `json:"scaleTriggerRef,omitempty"`
92-
// ScaleTrigger defines a set of triggers to scale the workloads.
93-
// If not defined, trigger configured in backendRuntime will be used,
94-
// otherwise, trigger defined here will overwrite the defaulted ones.
95-
// ScaleTriggerRef and ScaleTrigger can't be set at the same time.
96-
// +optional
97-
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
98-
}

api/inference/v1alpha1/playground_types.go

+13-2
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,22 @@ type PlaygroundSpec struct {
4444
BackendRuntimeConfig *BackendRuntimeConfig `json:"backendRuntimeConfig,omitempty"`
4545
// ElasticConfig defines the configuration for elastic usage,
4646
// e.g. the max/min replicas.
47-
// Note: this requires to install the HPA first or will report error.
48-
// +optional
4947
ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
5048
}
5149

50+
type ElasticConfig struct {
51+
// MinReplicas indicates the minimum number of inference workloads based on the traffic.
52+
// Default to 1.
53+
// MinReplicas couldn't be 0 now, will support serverless in the future.
54+
// +kubebuilder:default=1
55+
// +optional
56+
MinReplicas *int32 `json:"minReplicas,omitempty"`
57+
// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
58+
// Default to nil means there's no limit for the instance number.
59+
// +optional
60+
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
61+
}
62+
5263
const (
5364
// PlaygroundProgressing means the Playground is progressing now, such as waiting for the
5465
// inference service creation, rolling update or scaling up and down.

0 commit comments

Comments
 (0)