Environment details
- Programming language: Python
- OS: macOS, Apple Silicon
- Language runtime version: Python 3.12.12
- Package version: google-genai 2.8.0, installed from a local python-genai checkout
Steps to reproduce
- Install dependencies:
python -m pip install google-genai sounddevice
export GOOGLE_API_KEY="..."
- Save and run this script:
Full runnable repro script
from __future__ import annotations
import asyncio
import os
import re
import sys
from google import genai
from google.genai import types
# API key: GOOGLE_API_KEY is preferred; GEMINI_API_KEY is used when it is unset or empty.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")

# Sample rate (Hz) used for microphone capture.
INPUT_RATE = 16000
# Fallback sample rate (Hz) for playback when a response carries no rate.
OUTPUT_RATE = 24000
# Value passed as silence_duration_ms to automatic activity detection.
SILENCE_DURATION_MS = 10_000
def parse_audio_rate(mime_type: str | None, fallback: int) -> int:
if not mime_type:
return fallback
match = re.search(r"rate=(\d+)", mime_type)
return int(match.group(1)) if match else fallback
async def play_pcm_audio(audio: bytes, rate: int) -> None:
    """Play raw PCM audio through a sounddevice raw output stream.

    The stream is opened with one channel and ``int16`` samples. The write is
    executed via ``asyncio.to_thread`` so the event loop is not held up while
    the audio is written.

    Args:
        audio: PCM bytes handed to the output stream unchanged.
        rate: Sample rate used to open the output stream.
    """
    import sounddevice as sd

    def _play_blocking() -> None:
        stream_cm = sd.RawOutputStream(samplerate=rate, channels=1, dtype="int16")
        with stream_cm as out:
            out.write(audio)

    await asyncio.to_thread(_play_blocking)
async def microphone_pcm_chunks(rate: int = INPUT_RATE):
    """Yield raw PCM chunks captured from a sounddevice input stream, forever.

    The stream is opened with one channel, ``int16`` samples and a block size
    of ``rate * 0.1`` frames (about 100 ms per chunk). Captured chunks are
    handed to the event loop through a bounded queue; when the consumer falls
    behind and the queue is full, new chunks are dropped.

    Args:
        rate: Sample rate used to open the input stream.

    Yields:
        ``bytes`` objects, one per captured block.
    """
    import sounddevice as sd

    loop = asyncio.get_running_loop()
    queue: asyncio.Queue[bytes] = asyncio.Queue(maxsize=50)

    def enqueue(chunk: bytes) -> None:
        # Runs on the event loop. QueueFull is raised here, at the moment
        # put_nowait actually executes, so it must be handled here: wrapping
        # call_soon_threadsafe in try/except cannot catch it, because that
        # call only schedules the work and returns immediately.
        try:
            queue.put_nowait(chunk)
        except asyncio.QueueFull:
            # Consumer is behind; drop this chunk instead of stalling capture.
            pass

    def callback(indata, frames, time_info, status) -> None:
        if status:
            print(f"input status: {status}", file=sys.stderr)
        # Copy the buffer before handing it to the loop; call_soon_threadsafe
        # is used because this callback is not run by the event loop itself.
        loop.call_soon_threadsafe(enqueue, bytes(indata))

    with sd.RawInputStream(
        samplerate=rate,
        channels=1,
        dtype="int16",
        blocksize=int(rate * 0.1),
        callback=callback,
    ):
        while True:
            yield await queue.get()
async def receive_responses(session) -> None:
    """Collect the audio of each model turn and play it when the turn completes.

    For every pass of the outer loop, messages from ``session.receive()`` are
    scanned for inline audio data. The chunks are buffered, and the playback
    rate is taken from the chunk's MIME type when present (starting from
    ``OUTPUT_RATE``). When a message reports ``turn_complete`` the buffered
    audio, if any, is played and a fresh turn begins.

    Args:
        session: Live session object exposing an async-iterable ``receive()``.
    """
    while True:
        pending: list[bytes] = []
        playback_rate = OUTPUT_RATE
        async for message in session.receive():
            content = message.server_content
            if not content:
                continue

            turn = content.model_turn
            if turn:
                for part in turn.parts or []:
                    blob = part.inline_data
                    if not (blob and blob.data):
                        continue
                    pending.append(blob.data)
                    playback_rate = parse_audio_rate(blob.mime_type, playback_rate)

            if content.turn_complete:
                if pending:
                    await play_pcm_audio(b"".join(pending), playback_rate)
                # Stop consuming this turn; the outer loop starts the next one.
                break
async def main() -> None:
    """Stream microphone audio to a Gemini Live session and play its replies.

    Connects with automatic activity detection configured for low
    end-of-speech sensitivity and ``silence_duration_ms=SILENCE_DURATION_MS``,
    then forwards microphone chunks until interrupted.

    Raises:
        RuntimeError: If neither GOOGLE_API_KEY nor GEMINI_API_KEY is set.
    """
    if not GOOGLE_API_KEY:
        raise RuntimeError("Set GOOGLE_API_KEY or GEMINI_API_KEY.")

    model = "gemini-3.1-flash-live-preview"
    client = genai.Client(api_key=GOOGLE_API_KEY)
    config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        realtime_input_config=types.RealtimeInputConfig(
            automatic_activity_detection=types.AutomaticActivityDetection(
                end_of_speech_sensitivity=types.EndSensitivity.END_SENSITIVITY_LOW,
                silence_duration_ms=SILENCE_DURATION_MS,
            )
        ),
    )
    print(f"Connecting to {model} with silence_duration_ms={SILENCE_DURATION_MS}")
    print(f"Speak, then stop. Expected: about {SILENCE_DURATION_MS/1000} seconds before Gemini responds.")
    print("Press Ctrl+C to exit.")

    # Keep the declared MIME rate in sync with the actual capture rate.
    input_mime_type = f"audio/pcm;rate={INPUT_RATE}"

    async with client.aio.live.connect(model=model, config=config) as session:
        mic = microphone_pcm_chunks(INPUT_RATE)
        receive_task = asyncio.create_task(receive_responses(session))
        try:
            async for pcm_chunk in mic:
                if receive_task.done():
                    # The receiver only stops on failure; re-raise its error
                    # here instead of sending audio that nobody is answering.
                    receive_task.result()
                    break
                await session.send_realtime_input(
                    audio=types.Blob(data=pcm_chunk, mime_type=input_mime_type)
                )
        finally:
            # The microphone loop never finishes on its own, so the receiver
            # must be stopped explicitly on every exit path.
            receive_task.cancel()
            try:
                await receive_task
            except asyncio.CancelledError:
                pass
            finally:
                # Close the generator so the input stream is released now
                # rather than whenever the generator is garbage-collected.
                await mic.aclose()
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # The script tells the user to press Ctrl+C to exit, so treat it as
        # a normal shutdown instead of printing a traceback.
        pass
- Speak a short utterance and then stop speaking.
Expected behavior
With SILENCE_DURATION_MS = 10_000, Gemini Live automatic activity detection should wait for
roughly 10 seconds of detected non-speech before committing end-of-speech and starting the
model response.
Actual behavior
Gemini Live responds almost immediately, or much sooner than 10_000 ms, after speech stops.
Environment details
python-genai checkout
Steps to reproduce
Full runnable repro script
Expected behavior
With SILENCE_DURATION_MS = 10_000, Gemini Live automatic activity detection should wait for
roughly 10 seconds of detected non-speech before committing end-of-speech and starting the
model response.
Actual behavior
Gemini Live responds almost immediately, or much sooner than 10_000 ms, after speech stops.