
Commit 893f6d4

feat(gemma-server): add enums for weights (#19)
1 parent 89a61c3 commit 893f6d4

6 files changed, +148 -20 lines changed

.github/workflows/helm-charts.yaml (+31)

@@ -0,0 +1,31 @@
+name: Helm Charts
+
+on:
+  pull_request:
+    paths:
+      - 'fleet-charts/**'
+
+jobs:
+  lint-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/[email protected]
+        with:
+          version: v3.17.0
+
+      - uses: actions/[email protected]
+        with:
+          python-version: '3.x'
+          check-latest: true
+
+      - name: Set up chart-testing
+        uses: helm/[email protected]
+
+      - name: Run chart-linting
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs fleet-charts --validate-maintainers=false --helm-lint-extra-args '--set hf_api_token=NONE'
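
The same lint is easy to reproduce locally with the chart-testing CLI. A minimal sketch, assuming main is the default branch (in CI it is resolved from github.event.repository.default_branch):

    ct lint --target-branch main --chart-dirs fleet-charts --validate-maintainers=false --helm-lint-extra-args '--set hf_api_token=NONE'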

fleet-charts/gemma-server/Chart.yaml (+1 -1)

@@ -14,6 +14,6 @@
 
 apiVersion: v2
 name: gemma-server
-version: 0.0.1
+version: 0.0.2
 description: Serve Gemma 3 open models using GPUs on GKE with vLLM
 type: application

fleet-charts/gemma-server/templates/deployment.yaml (+79 -10)

@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Patterned on https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-1b-it.yaml
+# Patterned on:
+# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-1b-it.yaml
+# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-4b-it.yaml
+# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-14b-it.yaml
+# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-27b-it.yaml
 
 apiVersion: apps/v1
 kind: Deployment
@@ -32,26 +36,91 @@ spec:
         ai.gke.io/model: "gemma-3-{{ .Values.weight }}-it"
         ai.gke.io/inference-server: vllm
     spec:
-      containers:
+      containers: {{ if eq .Values.weight "1b" }}
       - name: inference-server
         image: "{{ .Values.image }}"
         resources:
           requests:
-            cpu: "{{ .Values.cpu }}"
-            memory: "{{ .Values.memory }}"
-            ephemeral-storage: "{{ .Values.storage }}"
-            nvidia.com/gpu: "{{ .Values.gpu }}"
+            cpu: "2"
+            memory: "10Gi"
+            ephemeral-storage: "10Gi"
+            nvidia.com/gpu: "1"
           limits:
-            cpu: "{{ .Values.cpu }}"
-            memory: "{{ .Values.memory }}"
-            ephemeral-storage: "{{ .Values.storage }}"
-            nvidia.com/gpu: "{{ .Values.gpu }}"
+            cpu: "2"
+            memory: "10Gi"
+            ephemeral-storage: "10Gi"
+            nvidia.com/gpu: "1"
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - --model=$(MODEL_ID)
+        - --tensor-parallel-size=1
+        - --host=0.0.0.0
+        - --port=8000{{ else if eq .Values.weight "4b" }}
+      - name: inference-server
+        image: "{{ .Values.image }}"
+        resources:
+          requests:
+            cpu: "2"
+            memory: "20Gi"
+            ephemeral-storage: "20Gi"
+            nvidia.com/gpu: "1"
+          limits:
+            cpu: "2"
+            memory: "20Gi"
+            ephemeral-storage: "20Gi"
+            nvidia.com/gpu: "1"
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - --model=$(MODEL_ID)
+        - --tensor-parallel-size=1
+        - --host=0.0.0.0
+        - --port=8000
+        - --max-model-len=32768
+        - --max-num-seqs=4{{- else if eq .Values.weight "12b" }}
+      - name: inference-server
+        image: "{{ .Values.image }}"
+        resources:
+          requests:
+            cpu: "4"
+            memory: "32Gi"
+            ephemeral-storage: "32Gi"
+            nvidia.com/gpu: "2"
+          limits:
+            cpu: "4"
+            memory: "32Gi"
+            ephemeral-storage: "32Gi"
+            nvidia.com/gpu: "2"
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - --model=$(MODEL_ID)
+        - --tensor-parallel-size=2
+        - --host=0.0.0.0
+        - --port=8000
+        - --max-model-len=16384
+        - --max-num-seqs=4{{ else if eq .Values.weight "27b" }}
+      - name: inference-server
+        image: "{{ .Values.image }}"
+        resources:
+          requests:
+            cpu: "10"
+            memory: "128Gi"
+            ephemeral-storage: "120Gi"
+            nvidia.com/gpu: "1"
+          limits:
+            cpu: "10"
+            memory: "128Gi"
+            ephemeral-storage: "120Gi"
+            nvidia.com/gpu: "1"
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args:
         - --model=$(MODEL_ID)
         - --tensor-parallel-size=1
         - --host=0.0.0.0
         - --port=8000
+        - --swap-space=16
+        - --gpu-memory-utilization=0.95
+        - --max-model-len=32768
+        - --max-num-seqs=4{{ end }}
         env:
         - name: MODEL_ID
           value: "google/gemma-3-{{ .Values.weight }}-it"

fleet-charts/gemma-server/values.schema.json (+32 -1)

@@ -4,10 +4,41 @@
     "hf_api_token": {
       "description": "Hugging Face token",
       "type": "string"
+    },
+    "weight": {
+      "description": "Gemma 3 Weight",
+      "type": "string",
+      "enum": [
+        "1b",
+        "4b",
+        "12b",
+        "27b"
+      ]
+    },
+    "image": {
+      "description": "VLLM Image",
+      "type": "string"
+    },
+    "accelerator": {
+      "description": "GPU Accelerator Type",
+      "type": "string"
+    },
+    "minReplicas": {
+      "description": "Minimum Replicas",
+      "type": "integer"
+    },
+    "maxReplicas": {
+      "description": "Maximum Replicas",
+      "type": "integer"
     }
   },
   "required": [
-    "hf_api_token"
+    "hf_api_token",
+    "weight",
+    "image",
+    "accelerator",
+    "minReplicas",
+    "maxReplicas"
   ],
   "title": "Values",
   "type": "object"

fleet-charts/gemma-server/values.yaml (+4 -7)

@@ -12,13 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Patterned on https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-1b-it.yaml
-image: "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250312_0916_RC01"
+# Gemma 3 weights: 1b, 4b, 12b, 27b
 weight: "1b"
-cpu: "2"
-memory: "10Gi"
-storage: "10Gi"
-gpu: "1"
-accelerator: nvidia-l4
+
+image: "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250312_0916_RC01"
+accelerator: "nvidia-l4"
 minReplicas: 1
 maxReplicas: 3
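
With the per-weight tuning folded into the template, values.yaml is reduced to the user-facing knobs. A typical install overriding the default weight, assuming the token lives in a hypothetical HF_TOKEN environment variable:

    helm upgrade --install gemma-server fleet-charts/gemma-server --set hf_api_token="$HF_TOKEN" --set weight=12b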

fleet-charts/hello-world/Chart.yaml (+1 -1)

@@ -14,7 +14,7 @@
 
 apiVersion: v2
 name: hello-world
-version: 0.0.1
+version: 0.0.2
 description: "Hello World sample Deployment with HorizontalPodAutoscaler, Service, and ServiceExport"
 type: application
 appVersion: v1.0
