Release v0.1.1

kerthcet · kerthcet · commit 701b9ab6dcef · 2025-02-18T14:47:10.000+08:00
Signed-off-by: kerthcet <kerthcet@gmail.com>
diff --git a/chart/Chart.yaml b/chart/Chart.yaml
@@ -13,9 +13,9 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.0.6
+version: 0.0.7
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: 0.1.0
+appVersion: 0.1.1
diff --git a/chart/crds/backendruntime-crd.yaml b/chart/crds/backendruntime-crd.yaml
diff --git a/chart/crds/openmodel-crd.yaml b/chart/crds/openmodel-crd.yaml
@@ -70,6 +70,23 @@ spec:
                         - Pod scheduling with node selectors specified.
                         - Cluster autoscaling with essential parameters provided.
                       properties:
+                        limits:
+                          additionalProperties:
+                            anyOf:
+                            - type: integer
+                            - type: string
+                            pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                            x-kubernetes-int-or-string: true
+                          description: |-
+                            Limits defines the required accelerators to serve the model for each replica,
+                            like <nvidia.com/gpu: 8>. For multi-host cases, the limits here indicate
+                            the resource requirements for each replica, usually equal to the TP size.
+                            Not recommended to set the cpu and memory usage here:
+                            - if using playground, you can define the cpu/mem usage at backendConfig.
+                            - if using inference service, you can define the cpu/mem at the container resources.
+                            However, if you define the same accelerator resources at playground/service as well,
+                            the resources will be overwritten by the flavor limit here.
+                          type: object
                         name:
                           description: Name represents the flavor name, which will
                             be used in model claim.
@@ -92,23 +109,6 @@ spec:
                             with <INSTANCE-TYPE: p4d.24xlarge> for AWS.
                             Preset parameters: TP, PP, INSTANCE-TYPE.
                           type: object
-                        requests:
-                          additionalProperties:
-                            anyOf:
-                            - type: integer
-                            - type: string
-                            pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
-                            x-kubernetes-int-or-string: true
-                          description: |-
-                            Requests defines the required accelerators to serve the model for each replica,
-                            like <nvidia.com/gpu: 8>. For multi-hosts cases, the requests here indicates
-                            the resource requirements for each replica, usually equals to the TP size.
-                            Not recommended to set the cpu and memory usage here:
-                            - if using playground, you can define the cpu/mem usage at backendConfig.
-                            - if using inference service, you can define the cpu/mem at the container resources.
-                            However, if you define the same accelerator requests at playground/service as well,
-                            the requests will be overwritten by the flavor requests.
-                          type: object
                       required:
                       - name
                       type: object
diff --git a/chart/crds/playground-crd.yaml b/chart/crds/playground-crd.yaml
@@ -47,22 +47,23 @@ spec:
                 properties:
                   args:
                     description: |-
-                      Args represents the specified arguments of the backendRuntime,
-                      will be append to the backendRuntime.spec.Args.
-                    properties:
-                      flags:
-                        description: |-
-                          Flags represents all the preset configurations.
-                          Flag around with {{ .CONFIG }} is a configuration waiting for render.
-                        items:
-                          type: string
-                        type: array
-                      name:
-                        default: default
-                        description: Name represents the identifier of the backendRuntime
-                          argument.
-                        type: string
-                    type: object
+                      Args represents all the arguments for the command.
+                      An argument wrapped in {{ .CONFIG }} is a configuration waiting to be rendered.
+                      Args defined here will be "appended" to the args in the recommendedConfig.
+                    items:
+                      type: string
+                    type: array
+                  backendName:
+                    default: vllm
+                    description: BackendName represents the inference backend under
+                      the hood, e.g. vLLM.
+                    type: string
+                  configName:
+                    description: |-
+                      ConfigName represents the recommended configuration name for the backend.
+                      It will be inferred from the models in the runtime if not specified, e.g. default,
+                      speculative-decoding or model-parallelism.
+                    type: string
                   envs:
                     description: Envs represents the environments set to the container.
                     items:
@@ -183,16 +184,12 @@ spec:
                       - name
                       type: object
                     type: array
-                  name:
-                    default: vllm
-                    description: Name represents the inference backend under the hood,
-                      e.g. vLLM.
-                    type: string
                   resources:
                     description: |-
                       Resources represents the resource requirements for backend, like cpu/mem,
                       accelerators like GPU should not be defined here, but at the model flavors,
                       or the values here will be overwritten.
+                      Resources defined here will "overwrite" the resources in the recommendedConfig.
                     properties:
                       limits:
                         additionalProperties:
@@ -219,38 +216,11 @@ spec:
                           More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
                         type: object
                     type: object
-                  version:
-                    description: |-
-                      Version represents the backend version if you want a different one
-                      from the default version.
-                    type: string
-                type: object
-              elasticConfig:
-                description: |-
-                  ElasticConfig defines the configuration for elastic usage,
-                  e.g. the max/min replicas.
-                  Note: this requires to install the HPA first or will report error.
-                properties:
-                  maxReplicas:
-                    description: |-
-                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
-                      Default to nil means there's no limit for the instance number.
-                    format: int32
-                    type: integer
-                  minReplicas:
-                    default: 1
-                    description: |-
-                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
-                      Default to 1.
-                      MinReplicas couldn't be 0 now, will support serverless in the future.
-                    format: int32
-                    type: integer
                   scaleTrigger:
                     description: |-
-                      ScaleTrigger defines a set of triggers to scale the workloads.
-                      If not defined, trigger configured in backendRuntime will be used,
-                      otherwise, trigger defined here will overwrite the defaulted ones.
-                      ScaleTriggerRef and ScaleTrigger can't be set at the same time.
+                      ScaleTrigger defines the rules to scale the workloads.
+                      Only one trigger could work at a time, mostly used in Playground.
+                      ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
                     properties:
                       hpa:
                         description: HPA represents the trigger configuration of the
@@ -859,19 +829,41 @@ spec:
                             type: array
                         type: object
                     type: object
-                  scaleTriggerRef:
+                  sharedMemorySize:
+                    anyOf:
+                    - type: integer
+                    - type: string
                     description: |-
-                      ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime
-                      with tuned target value.
-                      ScaleTriggerRef and ScaleTrigger can't be set at the same time.
-                    properties:
-                      name:
-                        description: Name represents the scale trigger name defined
-                          in the backendRuntime.scaleTriggers.
-                        type: string
-                    required:
-                    - name
-                    type: object
+                      SharedMemorySize represents the size of /dev/shm required in the runtime of
+                      inference workload.
+                      SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
+                    pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                    x-kubernetes-int-or-string: true
+                  version:
+                    description: |-
+                      Version represents the backend version if you want a different one
+                      from the default version.
+                    type: string
+                type: object
+              elasticConfig:
+                description: |-
+                  ElasticConfig defines the configuration for elastic usage,
+                  e.g. the max/min replicas.
+                properties:
+                  maxReplicas:
+                    description: |-
+                      MaxReplicas indicates the maximum number of inference workloads based on the traffic.
+                      Default to nil means there's no limit for the instance number.
+                    format: int32
+                    type: integer
+                  minReplicas:
+                    default: 1
+                    description: |-
+                      MinReplicas indicates the minimum number of inference workloads based on the traffic.
+                      Default to 1.
+                      MinReplicas couldn't be 0 now, will support serverless in the future.
+                    format: int32
+                    type: integer
                 type: object
               modelClaim:
                 description: |-
diff --git a/chart/templates/manager-rbac.yaml b/chart/templates/manager-rbac.yaml
@@ -14,6 +14,18 @@ rules:
   - list
   - update
   - watch
+- apiGroups:
+  - ""
+  resources:
+  - services
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
 - apiGroups:
   - admissionregistration.k8s.io
   resources:
diff --git a/chart/values.yaml b/chart/values.yaml
@@ -33,7 +33,7 @@ controllerManager:
         - ALL
     image:
       repository: inftyai/llmaz
-      tag: v0.1.0
+      tag: v0.1.1
     resources:
       limits:
         cpu: 500m
diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml
@@ -5,4 +5,4 @@ kind: Kustomization
 images:
 - name: controller
   newName: inftyai/llmaz
-  newTag: v0.1.0
+  newTag: v0.1.1
diff --git a/docs/installation.md b/docs/installation.md
@@ -12,7 +12,7 @@
 ```cmd
 helm repo add inftyai https://inftyai.github.io/llmaz
 helm repo update
-helm install llmaz inftyai/llmaz --namespace llmaz-system --create-namespace --version 0.0.6
+helm install llmaz inftyai/llmaz --namespace llmaz-system --create-namespace --version 0.0.7
 ```
 
 ### Uninstall
diff --git a/index.yaml b/index.yaml
@@ -1,6 +1,16 @@
 apiVersion: v1
 entries:
   llmaz:
+  - apiVersion: v2
+    appVersion: 0.1.1
+    created: "2025-02-18T14:46:30.474789+08:00"
+    description: A Helm chart for llmaz
+    digest: b30ba8a78986cba95256d4869f4f5bd0bd79c5d25867497021b80ae5f1ee04f0
+    name: llmaz
+    type: application
+    urls:
+    - https://inftyai.github.io/llmaz/llmaz-0.0.7.tgz
+    version: 0.0.7
   - apiVersion: v2
     appVersion: 0.1.0
     created: "2025-01-25T01:22:38.666093+08:00"
@@ -61,4 +71,4 @@ entries:
     urls:
     - https://inftyai.github.io/llmaz/llmaz-0.0.1.tgz
     version: 0.0.1
-generated: "2025-01-25T01:22:38.647336+08:00"
+generated: "2025-02-18T14:46:30.460221+08:00"