The inference-gateway serves as the HTTP ingress point for all inference requests in our deployment.
It’s implemented as a Kubernetes Gateway (`gateway.networking.k8s.io/v1`) using whichever `gatewayClassName` you’ve
chosen, either `istio` or `kgateway`, and sits in front of your inference pods to handle path-based routing, load-balancing,
retries, and metrics. All calls to `/v1/models` and `/v1/completions` flow through this gateway to the appropriate
`decode` or `prefill` services.

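If you want to look at the objects behind this ingress path, the Gateway API resources can be inspected directly with `kubectl`. This is an optional sanity check, and it assumes the release lives in the `llm-d` namespace as in the rest of this guide:

```bash
# List the Gateway and any HTTPRoutes that steer /v1/* traffic to the model services
kubectl get gateway -n llm-d
kubectl get httproute -n llm-d -o wide
```
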
#### Step 1: Port-forward the llm-d-inference-gateway service

Open a terminal and run:

```bash
if kubectl get svc -n llm-d llm-d-inference-gateway-istio &> /dev/null; then
  kubectl port-forward -n llm-d svc/llm-d-inference-gateway-istio 3000:80 # port forward istio gateway
else
  kubectl port-forward -n llm-d svc/llm-d-inference-gateway 3000:80 # port forward kgateway gateway
fi
```
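
Leave the port-forward running in that terminal. As an optional check, from a second terminal you can confirm the gateway answers on the forwarded port (this assumes the serving pods are already up):

```bash
# Expect an HTTP 200 once the gateway and the backing model pods are ready
curl -s -o /dev/null -w '%{http_code}\n' http://127.0.0.1:3000/v1/models
```
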
#### Step 2: Test the Inference Gateway with curl

In a new terminal, use `curl` to interact with the inference gateway:

```bash
# 1) List the available models and get the model ID
curl -s http://127.0.0.1:3000/v1/models
MODEL_ID=<INSERT_MODEL_NAME e.g. meta-llama/Llama-3.2-3B-Instruct, Qwen/Qwen3-0.6B, etc>

# 2) Send a completion request to the model
curl -X POST http://127.0.0.1:3000/v1/completions \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "model": "'"$MODEL_ID"'",
    "prompt": "You are a helpful AI assistant. Please introduce yourself in one sentence."
  }'
```
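
If `jq` is installed, you can also pull the model ID out of the `/v1/models` response instead of copying it by hand. This is an optional convenience and assumes the gateway returns the usual OpenAI-style listing with a `data[].id` field:

```bash
# Grab the ID of the first served model from the /v1/models listing (requires jq)
MODEL_ID=$(curl -s http://127.0.0.1:3000/v1/models | jq -r '.data[0].id')
echo "Using model: $MODEL_ID"
```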