
Commit 171171c

fix: open ai llm prewarm

1 parent 491bee2 commit 171171c

File tree

2 files changed: +186 -1 lines changed

livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py

Lines changed: 23 additions & 1 deletion
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import asyncio
 import os
 from dataclasses import asdict, dataclass
 from typing import Any, Literal
@@ -22,7 +23,7 @@
 import httpx
 
 import openai
-from livekit.agents import llm
+from livekit.agents import llm, utils
 from livekit.agents.inference.llm import LLMStream as _LLMStream
 from livekit.agents.llm import ToolChoice, utils as llm_utils
 from livekit.agents.llm.chat_context import ChatContext
@@ -157,6 +158,7 @@ def __init__(
                 ),
             ),
         )
+        self._prewarm_task: asyncio.Task[None] | None = None
 
     @property
     def model(self) -> str:
@@ -912,6 +914,26 @@ def chat(
             extra_kwargs=extra,
         )
 
+    def prewarm(self) -> None:
+        """Pre-warm the HTTP connection pool to reduce first-request latency."""
+
+        async def _prewarm_impl() -> None:
+            try:
+                await self._client.get("/", cast_to=str)
+            except Exception:
+                pass
+
+        # Cancel any existing prewarm task before creating a new one
+        if self._prewarm_task is not None and not self._prewarm_task.done():
+            self._prewarm_task.cancel()
+
+        self._prewarm_task = asyncio.create_task(_prewarm_impl())
+
+    async def aclose(self) -> None:
+        """Clean up resources including any pending prewarm tasks."""
+        if self._prewarm_task is not None:
+            await utils.aio.gracefully_cancel(self._prewarm_task)
+
 
 class LLMStream(_LLMStream):
     def __init__(
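
For context, here is a minimal usage sketch of the new method, mirroring the flow exercised in tests/test_llm_prewarm.py below. The model name and prompt are illustrative; the only assumption is that prewarm() is called once, as early as possible, before the first chat() request.

import asyncio

from livekit.agents import llm
from livekit.plugins import openai


async def main() -> None:
    # prewarm() is fire-and-forget: it schedules a background task and
    # does not block construction or startup.
    session_llm = openai.LLM(model="gpt-4o-mini")
    session_llm.prewarm()

    chat_ctx = llm.ChatContext()
    chat_ctx.add_message(role="user", content="Say 'hello' in one word")

    # The first chat() request should now reuse the pre-established HTTP connection.
    stream = session_llm.chat(chat_ctx=chat_ctx)
    async for chunk in stream:
        if chunk.delta and chunk.delta.content:
            print(chunk.delta.content, end="", flush=True)
    print()

    # aclose() gracefully cancels any prewarm task that is still pending.
    await session_llm.aclose()


if __name__ == "__main__":
    asyncio.run(main())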

tests/test_llm_prewarm.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
+"""
+Test LLM prewarming functionality (Issue #3240).
+
+This test suite verifies that the prewarm() method reduces first-request latency
+by pre-establishing HTTP connections to the LLM service.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+import time
+
+import pytest
+
+from livekit.agents import llm
+from livekit.plugins import openai
+
+pytestmark = pytest.mark.skipif(
+    not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set"
+)
+
+llm_model = "gpt-4o-mini"
+
+
+@pytest.mark.asyncio
+async def test_llm_prewarm_reduces_latency():
+    """Test that prewarming reduces time to first token (TTFT).
+    This test verifies that calling prewarm() before making an LLM request
+    reduces the latency of the first request by pre-establishing the HTTP connection.
+    """
+    # Test 1: WITHOUT prewarming
+    llm_no_prewarm = openai.LLM(model=llm_model)
+
+    chat_ctx = llm.ChatContext()
+    chat_ctx.add_message(role="user", content="Say 'test' in one word only")
+
+    start = time.perf_counter()
+    stream = llm_no_prewarm.chat(chat_ctx=chat_ctx)
+
+    # Measure time to first chunk
+    ttft_no_prewarm = 0
+    async for chunk in stream:
+        if chunk.delta and chunk.delta.content:
+            ttft_no_prewarm = time.perf_counter() - start
+            break
+
+    # Fully consume the stream to avoid leaks
+    async for _ in stream:
+        pass
+
+    await llm_no_prewarm.aclose()
+
+    # Test 2: WITH prewarming
+    llm_with_prewarm = openai.LLM(model=llm_model)
+    llm_with_prewarm.prewarm()
+
+    # Give the prewarm task a moment to establish the connection
+    await asyncio.sleep(0.3)
+
+    chat_ctx = llm.ChatContext()
+    chat_ctx.add_message(role="user", content="Say 'test' in one word only")
+
+    start = time.perf_counter()
+    stream = llm_with_prewarm.chat(chat_ctx=chat_ctx)
+
+    # Measure time to first chunk
+    ttft_with_prewarm = 0
+    async for chunk in stream:
+        if chunk.delta and chunk.delta.content:
+            ttft_with_prewarm = time.perf_counter() - start
+            break
+
+    # Fully consume the stream to avoid leaks
+    async for _ in stream:
+        pass
+
+    await llm_with_prewarm.aclose()
+
+    # Verify prewarming helped (should be at least slightly faster)
+    # We don't assert a specific improvement because network conditions vary,
+    # but we print the results for visibility
+    print("Prewarm Test Results:")
+    print(f"  Without prewarm: {ttft_no_prewarm:.3f}s")
+    print(f"  With prewarm: {ttft_with_prewarm:.3f}s")
+
+    if ttft_with_prewarm < ttft_no_prewarm:
+        improvement = ttft_no_prewarm - ttft_with_prewarm
+        improvement_pct = (improvement / ttft_no_prewarm) * 100
+        print(f"  Improvement: {improvement:.3f}s ({improvement_pct:.1f}% faster)")
+    else:
+        print("  No improvement detected (network conditions may vary)")
+
+    # The test passes if both requests succeeded
+    # We don't strictly assert latency improvements due to network variability
+    assert ttft_no_prewarm > 0
+    assert ttft_with_prewarm > 0
+
+
+@pytest.mark.asyncio
+async def test_llm_prewarm_task_cleanup():
+    """Test that the prewarm task is properly cleaned up on aclose()."""
+    llm_instance = openai.LLM(model=llm_model)
+
+    # Start prewarming
+    llm_instance.prewarm()
+
+    # Verify task was created
+    assert llm_instance._prewarm_task is not None
+
+    # Close immediately (should cancel the prewarm task gracefully)
+    await llm_instance.aclose()
+
+    # Task should be completed or cancelled
+    assert llm_instance._prewarm_task.done() or llm_instance._prewarm_task.cancelled()
+
+
+@pytest.mark.asyncio
+async def test_llm_prewarm_idempotent():
+    """Test that calling prewarm() multiple times doesn't cause issues."""
+    llm_instance = openai.LLM(model=llm_model)
+
+    # Call prewarm multiple times
+    llm_instance.prewarm()
+    first_task = llm_instance._prewarm_task
+
+    # Calling prewarm again should create a new task
+    llm_instance.prewarm()
+    second_task = llm_instance._prewarm_task
+
+    # Both tasks should exist
+    assert first_task is not None
+    assert second_task is not None
+
+    # Clean up - aclose() cancels any still-pending prewarm task so nothing leaks
+    await llm_instance.aclose()
+
+
+@pytest.mark.asyncio
+async def test_llm_works_without_prewarm():
+    """Test that the LLM works normally even without calling prewarm()."""
+    llm_instance = openai.LLM(model=llm_model)
+
+    # Don't call prewarm() at all
+    chat_ctx = llm.ChatContext()
+    chat_ctx.add_message(role="user", content="Say 'hello' in one word")
+
+    stream = llm_instance.chat(chat_ctx=chat_ctx)
+
+    # Should still work fine
+    response_received = False
+    async for chunk in stream:
+        if chunk.delta and chunk.delta.content:
+            response_received = True
+            break
+
+    # Fully consume the stream to avoid leaks
+    async for _ in stream:
+        pass
+
+    await llm_instance.aclose()
+
+    assert response_received, "Should receive response even without prewarm"
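
The suite skips itself when OPENAI_API_KEY is unset (see the pytestmark above). A minimal sketch for invoking it programmatically follows; it assumes pytest and pytest-asyncio are installed and the repository layout shown above. Running pytest tests/test_llm_prewarm.py -s from the repository root is equivalent, with -s keeping the latency printouts visible.

# Hypothetical runner script, not part of this commit.
import sys

import pytest

if __name__ == "__main__":
    # -s disables output capture so the TTFT printouts from
    # test_llm_prewarm_reduces_latency show up in the terminal.
    sys.exit(pytest.main(["-s", "tests/test_llm_prewarm.py"]))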
