Skip to content

gemini-3.1-flash-live-preview ignores automatic_activity_detection.silence_duration_ms #2580

@cyh0530

Description

@cyh0530

Environment details

  • Programming language: Python
  • OS: macOS, Apple Silicon
  • Language runtime version: Python 3.12.12
  • Package version: google-genai 2.8.0 from local python-genai checkout

Steps to reproduce

  1. Install dependencies:
python -m pip install google-genai sounddevice
export GOOGLE_API_KEY="..."
  2. Save and run this script:
Full runnable repro script
from __future__ import annotations
import asyncio
import os
import re
import sys

from google import genai
from google.genai import types


# API key handed to genai.Client; GOOGLE_API_KEY wins over GEMINI_API_KEY when both are set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
# Default microphone capture sample rate in Hz (default argument of microphone_pcm_chunks).
INPUT_RATE = 16000
# Playback sample rate in Hz used until a response's MIME type supplies a "rate=" value.
OUTPUT_RATE = 24000
# Value passed as automatic_activity_detection.silence_duration_ms (10 seconds).
SILENCE_DURATION_MS = 10_000

def parse_audio_rate(mime_type: str | None, fallback: int) -> int:
   if not mime_type:
       return fallback
   match = re.search(r"rate=(\d+)", mime_type)
   return int(match.group(1)) if match else fallback


async def play_pcm_audio(audio: bytes, rate: int) -> None:
   """Play a buffer of mono 16-bit PCM audio without blocking the event loop.

   Args:
       audio: Raw PCM bytes to write to the output stream.
       rate: Sample rate in Hz used to open the output stream.
   """
   import sounddevice as sd

   def _blocking_playback() -> None:
       # stream.write is a blocking call, so it runs in a worker thread below.
       with sd.RawOutputStream(samplerate=rate, channels=1, dtype="int16") as out:
           out.write(audio)

   await asyncio.to_thread(_blocking_playback)


async def microphone_pcm_chunks(rate: int = INPUT_RATE):
   """Yield mono 16-bit PCM chunks captured from the microphone, indefinitely.

   The input stream invokes ``callback`` with each captured block; blocks are
   handed to the event loop through a bounded queue and yielded in order.
   When the consumer falls behind and the queue is full, new chunks are
   dropped instead of blocking the capture callback.

   Args:
       rate: Capture sample rate in Hz.

   Yields:
       ``bytes`` objects, each holding one captured block of PCM data.
   """
   import sounddevice as sd

   loop = asyncio.get_running_loop()
   queue: asyncio.Queue[bytes] = asyncio.Queue(maxsize=50)

   def enqueue(chunk: bytes) -> None:
       # Scheduled via call_soon_threadsafe, so this runs on the event loop
       # thread. QueueFull has to be caught here: put_nowait executes later on
       # the loop, so a try/except around call_soon_threadsafe never sees it.
       try:
           queue.put_nowait(chunk)
       except asyncio.QueueFull:
           pass  # Deliberate best-effort: drop the chunk when the consumer lags.

   def callback(indata, frames, time_info, status) -> None:
       if status:
           print(f"input status: {status}", file=sys.stderr)
       # Copy the buffer before handing it to the loop thread.
       # NOTE(review): assumes indata is only valid during the callback — confirm.
       loop.call_soon_threadsafe(enqueue, bytes(indata))

   with sd.RawInputStream(
       samplerate=rate,
       channels=1,
       dtype="int16",
       blocksize=int(rate * 0.1),  # 0.1 s worth of frames per block
       callback=callback,
   ):
       while True:
           yield await queue.get()


async def receive_responses(session) -> None:
   """Collect each model turn's audio and play it once the turn completes.

   Runs forever. For every pass over ``session.receive()`` it gathers the
   inline audio data of the model turn, tracks the sample rate advertised in
   the parts' MIME types (starting from ``OUTPUT_RATE``), and plays the joined
   audio when a message reports ``turn_complete``.

   Args:
       session: Live session object exposing an async-iterable ``receive()``.
   """
   while True:
       rate = OUTPUT_RATE
       pcm_parts: list[bytes] = []

       async for message in session.receive():
           content = message.server_content
           if not content:
               continue

           turn = content.model_turn
           if turn:
               for part in turn.parts or []:
                   blob = part.inline_data
                   if not (blob and blob.data):
                       continue
                   pcm_parts.append(blob.data)
                   # Keep the previous rate when this part's MIME type has none.
                   rate = parse_audio_rate(blob.mime_type, rate)

           if content.turn_complete:
               if pcm_parts:
                   await play_pcm_audio(b"".join(pcm_parts), rate)
               break


async def main() -> None:
   """Stream microphone audio to Gemini Live and play back the model's replies.

   Opens a live session configured with low end-of-speech sensitivity and
   ``silence_duration_ms=SILENCE_DURATION_MS``, starts a background task that
   plays responses, and forwards microphone chunks until interrupted.

   Raises:
       RuntimeError: If neither GOOGLE_API_KEY nor GEMINI_API_KEY is set.
   """
   if not GOOGLE_API_KEY:
       raise RuntimeError("Set GOOGLE_API_KEY or GEMINI_API_KEY.")

   client = genai.Client(api_key=GOOGLE_API_KEY)

   config = types.LiveConnectConfig(
       response_modalities=["AUDIO"],
       realtime_input_config=types.RealtimeInputConfig(
           automatic_activity_detection=types.AutomaticActivityDetection(
               end_of_speech_sensitivity=types.EndSensitivity.END_SENSITIVITY_LOW,
               silence_duration_ms=SILENCE_DURATION_MS,
           )
       ),
   )

   print(f"Connecting to gemini-3.1-flash-live-preview with silence_duration_ms={SILENCE_DURATION_MS}")
   print(f"Speak, then stop. Expected: about {SILENCE_DURATION_MS/1000} seconds before Gemini responds.")
   print("Press Ctrl+C to exit.")

   async with client.aio.live.connect(
       model="gemini-3.1-flash-live-preview",
       config=config,
   ) as session:
       receive_task = asyncio.create_task(receive_responses(session))
       try:
           async for pcm_chunk in microphone_pcm_chunks():
               await session.send_realtime_input(
                   # Derive the declared rate from INPUT_RATE (the capture
                   # default) so the MIME type cannot drift from the audio.
                   audio=types.Blob(data=pcm_chunk, mime_type=f"audio/pcm;rate={INPUT_RATE}")
               )

           await receive_task
       finally:
           # Stop the receiver before the session closes, so it is not left
           # reading from a closed connection. No-op if it already finished.
           receive_task.cancel()


if __name__ == "__main__":
   try:
       asyncio.run(main())
   except KeyboardInterrupt:
       # The script tells the user to press Ctrl+C to exit, so treat it as a
       # normal shutdown rather than printing a traceback.
       pass
  3. Speak a short utterance and then stop speaking.

Expected behavior

With SILENCE_DURATION_MS = 10_000, Gemini Live automatic activity detection should wait for
roughly 10 seconds of detected non-speech before committing end-of-speech and starting the
model response.

Actual behavior

Gemini Live responds almost immediately, or much sooner than 10_000 ms, after speech stops.

Metadata

Metadata

Labels

priority: p2 — Moderately-important priority. Fix may not be included in next release.
type: bug — Error or flaw in code with unintended results or allowing sub-optimal usage patterns.

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions