
Commit 893f6d4

feat(gemma-server): add enums for weights (#19)
1 parent 89a61c3 commit 893f6d4

6 files changed, +148 -20 lines changed

.github/workflows/helm-charts.yaml (+31)

@@ -0,0 +1,31 @@
+name: Helm Charts
+
+on:
+  pull_request:
+    paths:
+      - 'fleet-charts/**'
+
+jobs:
+  lint-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/[email protected]
+        with:
+          version: v3.17.0
+
+      - uses: actions/[email protected]
+        with:
+          python-version: '3.x'
+          check-latest: true
+
+      - name: Set up chart-testing
+        uses: helm/[email protected]
+
+      - name: Run chart-linting
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs fleet-charts --validate-maintainers=false --helm-lint-extra-args '--set hf_api_token=NONE'
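
The same lint is easy to reproduce locally with the chart-testing CLI. A minimal sketch, assuming main is the default branch (in CI it is resolved from github.event.repository.default_branch):

    ct lint --target-branch main --chart-dirs fleet-charts --validate-maintainers=false --helm-lint-extra-args '--set hf_api_token=NONE'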

fleet-charts/gemma-server/Chart.yaml (+1 -1)

@@ -14,6 +14,6 @@
 
 apiVersion: v2
 name: gemma-server
-version: 0.0.1
+version: 0.0.2
 description: Serve Gemma 3 open models using GPUs on GKE with vLLM
 type: application

fleet-charts/gemma-server/templates/deployment.yaml (+79 -10)

@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Patterned on https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-1b-it.yaml
+# Patterned on:
+# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-1b-it.yaml
+# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-4b-it.yaml
+# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-14b-it.yaml
+# - https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-27b-it.yaml
 
 apiVersion: apps/v1
 kind: Deployment
@@ -32,26 +36,91 @@ spec:
         ai.gke.io/model: "gemma-3-{{ .Values.weight }}-it"
         ai.gke.io/inference-server: vllm
     spec:
-      containers:
+      containers: {{ if eq .Values.weight "1b" }}
       - name: inference-server
         image: "{{ .Values.image }}"
         resources:
           requests:
-            cpu: "{{ .Values.cpu }}"
-            memory: "{{ .Values.memory }}"
-            ephemeral-storage: "{{ .Values.storage }}"
-            nvidia.com/gpu: "{{ .Values.gpu }}"
+            cpu: "2"
+            memory: "10Gi"
+            ephemeral-storage: "10Gi"
+            nvidia.com/gpu: "1"
           limits:
-            cpu: "{{ .Values.cpu }}"
-            memory: "{{ .Values.memory }}"
-            ephemeral-storage: "{{ .Values.storage }}"
-            nvidia.com/gpu: "{{ .Values.gpu }}"
+            cpu: "2"
+            memory: "10Gi"
+            ephemeral-storage: "10Gi"
+            nvidia.com/gpu: "1"
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - --model=$(MODEL_ID)
+        - --tensor-parallel-size=1
+        - --host=0.0.0.0
+        - --port=8000{{ else if eq .Values.weight "4b" }}
+      - name: inference-server
+        image: "{{ .Values.image }}"
+        resources:
+          requests:
+            cpu: "2"
+            memory: "20Gi"
+            ephemeral-storage: "20Gi"
+            nvidia.com/gpu: "1"
+          limits:
+            cpu: "2"
+            memory: "20Gi"
+            ephemeral-storage: "20Gi"
+            nvidia.com/gpu: "1"
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - --model=$(MODEL_ID)
+        - --tensor-parallel-size=1
+        - --host=0.0.0.0
+        - --port=8000
+        - --max-model-len=32768
+        - --max-num-seqs=4{{- else if eq .Values.weight "12b" }}
+      - name: inference-server
+        image: "{{ .Values.image }}"
+        resources:
+          requests:
+            cpu: "4"
+            memory: "32Gi"
+            ephemeral-storage: "32Gi"
+            nvidia.com/gpu: "2"
+          limits:
+            cpu: "4"
+            memory: "32Gi"
+            ephemeral-storage: "32Gi"
+            nvidia.com/gpu: "2"
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - --model=$(MODEL_ID)
+        - --tensor-parallel-size=2
+        - --host=0.0.0.0
+        - --port=8000
+        - --max-model-len=16384
+        - --max-num-seqs=4{{ else if eq .Values.weight "27b" }}
+      - name: inference-server
+        image: "{{ .Values.image }}"
+        resources:
+          requests:
+            cpu: "10"
+            memory: "128Gi"
+            ephemeral-storage: "120Gi"
+            nvidia.com/gpu: "1"
+          limits:
+            cpu: "10"
+            memory: "128Gi"
+            ephemeral-storage: "120Gi"
+            nvidia.com/gpu: "1"
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args:
         - --model=$(MODEL_ID)
         - --tensor-parallel-size=1
         - --host=0.0.0.0
         - --port=8000
+        - --swap-space=16
+        - --gpu-memory-utilization=0.95
+        - --max-model-len=32768
+        - --max-num-seqs=4{{ end }}
         env:
         - name: MODEL_ID
           value: "google/gemma-3-{{ .Values.weight }}-it"

fleet-charts/gemma-server/values.schema.json (+32 -1)

@@ -4,10 +4,41 @@
     "hf_api_token": {
       "description": "Hugging Face token",
       "type": "string"
+    },
+    "weight": {
+      "description": "Gemma 3 Weight",
+      "type": "string",
+      "enum": [
+        "1b",
+        "4b",
+        "12b",
+        "27b"
+      ]
+    },
+    "image": {
+      "description": "VLLM Image",
+      "type": "string"
+    },
+    "accelerator": {
+      "description": "GPU Accelerator Type",
+      "type": "string"
+    },
+    "minReplicas": {
+      "description": "Minimum Replicas",
+      "type": "integer"
+    },
+    "maxReplicas": {
+      "description": "Maximum Replicas",
+      "type": "integer"
     }
   },
   "required": [
-    "hf_api_token"
+    "hf_api_token",
+    "weight",
+    "image",
+    "accelerator",
+    "minReplicas",
+    "maxReplicas"
   ],
   "title": "Values",
   "type": "object"

fleet-charts/gemma-server/values.yaml (+4 -7)

@@ -12,13 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Patterned on https://raw.githubusercontent.com/GoogleCloudPlatform/kubernetes-engine-samples/c70138cd0a441bb168307000dfb4c698918c15a7/ai-ml/llm-serving-gemma/vllm/vllm-3-1b-it.yaml
-image: "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250312_0916_RC01"
+# Gemma 3 weights: 1b, 4b, 12b, 27b
 weight: "1b"
-cpu: "2"
-memory: "10Gi"
-storage: "10Gi"
-gpu: "1"
-accelerator: nvidia-l4
+
+image: "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250312_0916_RC01"
+accelerator: "nvidia-l4"
 minReplicas: 1
 maxReplicas: 3
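
With the per-weight tuning folded into the template, values.yaml is reduced to the user-facing knobs. A typical install overriding the default weight, assuming the token lives in a hypothetical HF_TOKEN environment variable:

    helm upgrade --install gemma-server fleet-charts/gemma-server --set hf_api_token="$HF_TOKEN" --set weight=12b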

fleet-charts/hello-world/Chart.yaml (+1 -1)

@@ -14,7 +14,7 @@
 
 apiVersion: v2
 name: hello-world
-version: 0.0.1
+version: 0.0.2
 description: "Hello World sample Deployment with HorizontalPodAutoscaler, Service, and ServiceExport"
 type: application
 appVersion: v1.0
