gemini-3.1-flash-live-preview ignores automatic_activity_detection.silence_duration_ms

#### Environment details

- Programming language: Python
- OS: macOS, Apple Silicon
- Language runtime version: Python 3.12.12
- Package version: google-genai 2.8.0 from local `python-genai` checkout

#### Steps to reproduce

1. Install dependencies:
```bash
python -m pip install google-genai sounddevice
export GOOGLE_API_KEY="..."
```

2. Save and run this script:
<details>
  <summary>Full runnable repro script</summary>

```python

from __future__ import annotations
import asyncio
import os
import re
import sys

from google import genai
from google.genai import types


# API key, read from GOOGLE_API_KEY with GEMINI_API_KEY as the fallback.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
# Default sample rate (Hz) used when capturing microphone audio.
INPUT_RATE = 16000
# Playback sample rate (Hz) used when a response's MIME type declares none.
OUTPUT_RATE = 24000
# Value sent as automatic_activity_detection.silence_duration_ms.
SILENCE_DURATION_MS = 10_000

def parse_audio_rate(mime_type: str | None, fallback: int) -> int:
   if not mime_type:
       return fallback
   match = re.search(r"rate=(\d+)", mime_type)
   return int(match.group(1)) if match else fallback


async def play_pcm_audio(audio: bytes, rate: int) -> None:
   """Play a buffer of mono 16-bit PCM audio without blocking the event loop.

   Args:
      audio: Raw PCM bytes to write to the output stream.
      rate: Sample rate passed to the output stream.
   """
   import sounddevice as sd

   def blocking_playback() -> None:
      # stream.write() blocks, so this helper is run in a worker thread.
      with sd.RawOutputStream(
         samplerate=rate,
         channels=1,
         dtype="int16",
      ) as out_stream:
         out_stream.write(audio)

   await asyncio.to_thread(blocking_playback)


async def microphone_pcm_chunks(rate: int = INPUT_RATE):
   """Yield raw PCM chunks captured from the default input device, forever.

   Audio is captured as mono ``int16`` in blocks of ``int(rate * 0.1)`` frames
   (about 100 ms each) and handed to the event loop through a bounded queue.
   When the consumer falls behind and the queue is full, new chunks are
   dropped rather than blocking the audio callback.

   Args:
      rate: Capture sample rate passed to the input stream.

   Yields:
      ``bytes`` objects, one per captured block.
   """
   import sounddevice as sd

   loop = asyncio.get_running_loop()
   queue: asyncio.Queue[bytes] = asyncio.Queue(maxsize=50)

   def enqueue(chunk: bytes) -> None:
      # Runs on the event loop. QueueFull has to be caught here: put_nowait is
      # executed later by the loop, not inside call_soon_threadsafe, so a
      # try/except around the scheduling call would never see it.
      try:
         queue.put_nowait(chunk)
      except asyncio.QueueFull:
         pass  # Deliberately drop audio when the consumer cannot keep up.

   def callback(indata, frames, time_info, status) -> None:
      if status:
         print(f"input status: {status}", file=sys.stderr)
      # Copy the buffer before handing it to the loop.
      loop.call_soon_threadsafe(enqueue, bytes(indata))

   with sd.RawInputStream(
      samplerate=rate,
      channels=1,
      dtype="int16",
      blocksize=int(rate * 0.1),
      callback=callback,
   ):
      while True:
         yield await queue.get()


async def receive_responses(session) -> None:
   """Collect each model turn's audio and play it once the turn completes.

   Loops forever. For every pass it iterates ``session.receive()``, buffers
   the inline audio data of each model-turn part, and tracks the sample rate
   declared in the part's MIME type (starting from ``OUTPUT_RATE``). When a
   message reports ``turn_complete``, any buffered audio is played and the
   next pass begins. If the iterator ends without ``turn_complete``, the
   buffered audio is discarded.

   Args:
      session: Live session object exposing an async-iterable ``receive()``.
   """
   while True:
      pcm_parts: list[bytes] = []
      playback_rate = OUTPUT_RATE

      async for message in session.receive():
         content = message.server_content

         if content and content.model_turn:
            for part in content.model_turn.parts or []:
               blob = part.inline_data
               if not (blob and blob.data):
                  continue
               pcm_parts.append(blob.data)
               playback_rate = parse_audio_rate(blob.mime_type, playback_rate)

         if content and content.turn_complete:
            if pcm_parts:
               await play_pcm_audio(b"".join(pcm_parts), playback_rate)
            break


async def main() -> None:
   """Stream microphone audio to Gemini Live and play back its responses.

   Connects with automatic activity detection configured to use
   ``SILENCE_DURATION_MS``, starts a background task that plays responses,
   and forwards microphone chunks until the stream ends or an error occurs.

   Raises:
      RuntimeError: If neither GOOGLE_API_KEY nor GEMINI_API_KEY is set.
   """
   if not GOOGLE_API_KEY:
      raise RuntimeError("Set GOOGLE_API_KEY or GEMINI_API_KEY.")

   client = genai.Client(api_key=GOOGLE_API_KEY)

   config = types.LiveConnectConfig(
      response_modalities=["AUDIO"],
      realtime_input_config=types.RealtimeInputConfig(
         automatic_activity_detection=types.AutomaticActivityDetection(
            end_of_speech_sensitivity=types.EndSensitivity.END_SENSITIVITY_LOW,
            silence_duration_ms=SILENCE_DURATION_MS,
         )
      ),
   )

   print(f"Connecting to gemini-3.1-flash-live-preview with silence_duration_ms={SILENCE_DURATION_MS}")
   print(f"Speak, then stop. Expected: about {SILENCE_DURATION_MS/1000} seconds before Gemini responds.")
   print("Press Ctrl+C to exit.")

   async with client.aio.live.connect(
      model="gemini-3.1-flash-live-preview",
      config=config,
   ) as session:
      receive_task = asyncio.create_task(receive_responses(session))

      try:
         async for pcm_chunk in microphone_pcm_chunks():
            # The receiver loops forever, so "done" means it failed. Stop
            # sending so the error is surfaced by the await below instead of
            # being lost while the microphone loop keeps running.
            if receive_task.done():
               break
            await session.send_realtime_input(
               audio=types.Blob(
                  data=pcm_chunk,
                  mime_type=f"audio/pcm;rate={INPUT_RATE}",
               )
            )

         await receive_task
      finally:
         # Do not leave the receiver running against a closing session when
         # the send loop exits early (error or Ctrl+C).
         if not receive_task.done():
            receive_task.cancel()
            try:
               await receive_task
            except asyncio.CancelledError:
               pass


if __name__ == "__main__":
   try:
      asyncio.run(main())
   except KeyboardInterrupt:
      # The script tells the user to press Ctrl+C to exit, so treat it as a
      # normal shutdown rather than printing a traceback.
      pass
```

</details>

3. Speak a short utterance and then stop speaking.

#### Expected behavior

With `SILENCE_DURATION_MS = 10_000`, Gemini Live automatic activity detection should wait for
roughly 10 seconds of detected non-speech before committing end-of-speech and starting the
model response.

#### Actual behavior

Gemini Live responds almost immediately after speech stops — far sooner than the configured 10,000 ms — so `silence_duration_ms` appears to be ignored.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

gemini-3.1-flash-live-preview ignores automatic_activity_detection.silence_duration_ms #2580

Environment details

Steps to reproduce

Expected behavior

Actual behavior

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

gemini-3.1-flash-live-preview ignores automatic_activity_detection.silence_duration_ms #2580

Description

Environment details

Steps to reproduce

Expected behavior

Actual behavior

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions