Skip to content

Commit 2909d8e

Browse files
committed
fix: open ai llm prewarm
1 parent 491bee2 commit 2909d8e

File tree

2 files changed

+185
-1
lines changed

2 files changed

+185
-1
lines changed

livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from __future__ import annotations
1616

17+
import asyncio
1718
import os
1819
from dataclasses import asdict, dataclass
1920
from typing import Any, Literal
@@ -22,7 +23,7 @@
2223
import httpx
2324

2425
import openai
25-
from livekit.agents import llm
26+
from livekit.agents import llm, utils
2627
from livekit.agents.inference.llm import LLMStream as _LLMStream
2728
from livekit.agents.llm import ToolChoice, utils as llm_utils
2829
from livekit.agents.llm.chat_context import ChatContext
@@ -157,6 +158,7 @@ def __init__(
157158
),
158159
),
159160
)
161+
self._prewarm_task: asyncio.Task[None] | None = None
160162

161163
@property
162164
def model(self) -> str:
@@ -912,6 +914,26 @@ def chat(
912914
extra_kwargs=extra,
913915
)
914916

917+
def prewarm(self) -> None:
    """Kick off a background request so the HTTP connection pool is warm.

    Establishing the TCP/TLS connection ahead of time shaves that setup
    cost off the first real chat request. Safe to call repeatedly: a
    still-running warm-up is cancelled and replaced with a fresh one.
    """

    async def _touch_endpoint() -> None:
        # Best-effort: any response (even an error status) is enough to
        # establish the connection, so failures are deliberately ignored.
        try:
            await self._client.get("/", cast_to=str)
        except Exception:
            pass

    previous = self._prewarm_task
    if previous is not None and not previous.done():
        previous.cancel()

    self._prewarm_task = asyncio.create_task(_touch_endpoint())
931+
932+
async def aclose(self) -> None:
    """Release resources, cancelling any still-pending prewarm task."""
    # NOTE(review): if the base LLM class also defines aclose(), confirm
    # whether super().aclose() should be chained here as well.
    task = self._prewarm_task
    if task is not None:
        # gracefully_cancel awaits the task after cancelling it, so no
        # dangling warm-up request is left behind.
        await utils.aio.gracefully_cancel(task)
936+
915937

916938
class LLMStream(_LLMStream):
917939
def __init__(

tests/test_llm_prewarm.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
"""
2+
Test LLM prewarming functionality (Issue #3240).
3+
4+
This test suite verifies that the prewarm() method reduces first-request latency
5+
by pre-establishing HTTP connections to the LLM service.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import asyncio
11+
import os
12+
import time
13+
14+
import pytest
15+
16+
from livekit.agents import llm
17+
from livekit.plugins import openai
18+
19+
pytestmark = pytest.mark.skipif(
20+
not os.environ.get("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set"
21+
)
22+
23+
llm_model ="gpt-4o-mini"
24+
25+
async def _measure_ttft(llm_instance: openai.LLM) -> float:
    """Send one short chat request and return the time-to-first-token.

    Returns 0.0 if the stream produced no content chunk. The stream is
    fully consumed before returning to avoid leaking the request.
    """
    chat_ctx = llm.ChatContext()
    chat_ctx.add_message(role="user", content="Say 'test' in one word only")

    start = time.perf_counter()
    stream = llm_instance.chat(chat_ctx=chat_ctx)

    ttft = 0.0
    async for chunk in stream:
        if chunk.delta and chunk.delta.content:
            ttft = time.perf_counter() - start
            break

    # Fully consume the stream to avoid leaks
    async for _ in stream:
        pass

    return ttft


@pytest.mark.asyncio
async def test_llm_prewarm_reduces_latency():
    """Test that prewarming reduces time to first token (TTFT).

    This test verifies that calling prewarm() before making an LLM request
    reduces the latency of the first request by pre-establishing the HTTP
    connection. The measurement logic is shared via _measure_ttft so both
    runs are timed identically.
    """
    # Test 1: WITHOUT prewarming
    llm_no_prewarm = openai.LLM(model=llm_model)
    ttft_no_prewarm = await _measure_ttft(llm_no_prewarm)
    await llm_no_prewarm.aclose()

    # Test 2: WITH prewarming
    llm_with_prewarm = openai.LLM(model=llm_model)
    llm_with_prewarm.prewarm()

    # Give the prewarm task a moment to establish the connection
    await asyncio.sleep(0.3)

    ttft_with_prewarm = await _measure_ttft(llm_with_prewarm)
    await llm_with_prewarm.aclose()

    # We don't assert a specific improvement because network conditions vary,
    # but we print the results for visibility.
    print("Prewarm Test Results:")
    print(f" Without prewarm: {ttft_no_prewarm:.3f}s")
    print(f" With prewarm: {ttft_with_prewarm:.3f}s")

    if ttft_with_prewarm < ttft_no_prewarm:
        improvement = ttft_no_prewarm - ttft_with_prewarm
        improvement_pct = (improvement / ttft_no_prewarm) * 100
        print(f"Improvement: {improvement:.3f}s ({improvement_pct:.1f}% faster)")
    else:
        print(" No improvement detected (network conditions may vary)")

    # The test passes if both requests produced content; strict latency
    # assertions are avoided due to network variability.
    assert ttft_no_prewarm > 0
    assert ttft_with_prewarm > 0
97+
98+
99+
@pytest.mark.asyncio
async def test_llm_prewarm_task_cleanup():
    """Test that prewarm task is properly cleaned up on aclose()."""
    llm_instance = openai.LLM(model=llm_model)

    llm_instance.prewarm()

    # prewarm() must have scheduled a background task.
    assert llm_instance._prewarm_task is not None

    # Closing right away should cancel the warm-up gracefully.
    await llm_instance.aclose()

    # After aclose() the task is either finished or cancelled — never
    # left pending.
    task = llm_instance._prewarm_task
    assert task.done() or task.cancelled()
115+
116+
117+
@pytest.mark.asyncio
async def test_llm_prewarm_idempotent():
    """Test that calling prewarm() multiple times doesn't cause issues.

    A repeated prewarm() call must replace the previous warm-up task with a
    fresh one rather than reusing it.
    """
    llm_instance = openai.LLM(model=llm_model)

    # First call schedules a warm-up task
    llm_instance.prewarm()
    first_task = llm_instance._prewarm_task

    # Calling prewarm again should create a new task
    llm_instance.prewarm()
    second_task = llm_instance._prewarm_task

    # Both tasks should exist
    assert first_task is not None
    assert second_task is not None
    # Previously only existence was checked; also verify that a *new* task
    # was created instead of the first one being reused.
    assert second_task is not first_task

    # Clean up — aclose() cancels any still-pending warm-up task
    await llm_instance.aclose()
136+
137+
138+
@pytest.mark.asyncio
async def test_llm_works_without_prewarm():
    """Test that LLM works normally even without calling prewarm()."""
    llm_instance = openai.LLM(model=llm_model)

    # Deliberately skip prewarm() and go straight to a chat request.
    chat_ctx = llm.ChatContext()
    chat_ctx.add_message(role="user", content="Say 'hello' in one word")

    stream = llm_instance.chat(chat_ctx=chat_ctx)

    # A single content chunk is enough to prove the request succeeded.
    response_received = False
    async for chunk in stream:
        if chunk.delta and chunk.delta.content:
            response_received = True
            break

    # Fully consume the stream to avoid leaks
    async for _ in stream:
        pass

    await llm_instance.aclose()

    assert response_received, "Should receive response even without prewarm"

0 commit comments

Comments
 (0)