|
1 | | -"""AudioIO - Clean separation of audio functionality from core BidiAgent. |
| 1 | +"""Send and receive audio data from devices. |
2 | 2 |
|
3 | | -Provides audio input/output capabilities for BidiAgent through the BidiIO protocol. |
4 | | -Handles all PyAudio setup, streaming, and cleanup while keeping the core agent data-agnostic. |
| 3 | +Reads user audio from input device and sends agent audio to output device using PyAudio. If a user interrupts the agent, |
| 4 | +the output buffer is cleared to stop playback. |
5 | 5 | """ |
6 | 6 |
|
7 | 7 | import asyncio |
8 | 8 | import base64 |
9 | 9 | import logging |
| 10 | +from collections import deque |
| 11 | +from typing import Any |
10 | 12 |
|
11 | 13 | import pyaudio |
12 | 14 |
|
13 | 15 | from ..types.io import BidiInput, BidiOutput |
14 | | -from ..types.events import BidiAudioInputEvent, BidiAudioStreamEvent, BidiOutputEvent |
| 16 | +from ..types.events import BidiAudioInputEvent, BidiAudioStreamEvent, BidiInterruptionEvent, BidiOutputEvent |
15 | 17 |
|
16 | 18 | logger = logging.getLogger(__name__) |
17 | 19 |
|
18 | 20 |
|
class _BidiAudioInput(BidiInput):
    """Handle audio input from user.

    The PyAudio instance and input stream only exist between `start()` and `stop()`.
    Both lifecycle methods are safe to call repeatedly.

    Attributes:
        _audio: PyAudio instance for audio system access, or None while stopped.
        _stream: Audio input stream, or None while stopped.
    """

    _audio: pyaudio.PyAudio | None
    _stream: pyaudio.Stream | None

    _CHANNELS: int = 1
    _DEVICE_INDEX: int | None = None
    _ENCODING: str = "pcm"
    _FORMAT: int = pyaudio.paInt16
    _FRAMES_PER_BUFFER: int = 512
    _RATE: int = 16000

    def __init__(self, config: dict[str, Any]) -> None:
        """Extract configs.

        Args:
            config: Audio configuration. Only the `input_*` keys are read; any missing key falls back to the
                class-level default.
        """
        self._channels = config.get("input_channels", _BidiAudioInput._CHANNELS)
        self._device_index = config.get("input_device_index", _BidiAudioInput._DEVICE_INDEX)
        self._format = config.get("input_format", _BidiAudioInput._FORMAT)
        self._frames_per_buffer = config.get("input_frames_per_buffer", _BidiAudioInput._FRAMES_PER_BUFFER)
        self._rate = config.get("input_rate", _BidiAudioInput._RATE)

        # Nothing is opened until start(); initializing here lets stop() run safely on a never-started instance.
        self._audio = None
        self._stream = None

    async def start(self) -> None:
        """Start input stream.

        Does nothing if the stream is already open.
        """
        if self._stream is not None:
            return

        audio = pyaudio.PyAudio()
        try:
            self._stream = audio.open(
                channels=self._channels,
                format=self._format,
                frames_per_buffer=self._frames_per_buffer,
                input=True,
                input_device_index=self._device_index,
                rate=self._rate,
            )
        except Exception:
            # Do not leak the PyAudio instance when the device cannot be opened.
            audio.terminate()
            raise

        self._audio = audio

    async def stop(self) -> None:
        """Stop input stream.

        Does nothing if the stream was never started or has already been stopped.
        """
        if self._stream is None:
            return

        # TODO: Provide time for streaming thread to exit cleanly to prevent conflicts with the Nova threads.
        #       See if we can remove after properly handling cancellation for agent.
        await asyncio.sleep(0.1)

        stream, audio = self._stream, self._audio
        self._stream = None
        self._audio = None

        try:
            stream.close()
        finally:
            # Release the audio system even if closing the stream fails.
            audio.terminate()

    async def __call__(self) -> BidiAudioInputEvent:
        """Read audio from input stream.

        The blocking read runs in a worker thread so the event loop is not stalled.

        Returns:
            Event carrying one buffer of base64-encoded audio together with the channel count, encoding and
            sample rate it was captured with.
        """
        audio_bytes = await asyncio.to_thread(
            self._stream.read, self._frames_per_buffer, exception_on_overflow=False
        )

        return BidiAudioInputEvent(
            audio=base64.b64encode(audio_bytes).decode("utf-8"),
            channels=self._channels,
            format=_BidiAudioInput._ENCODING,
            sample_rate=self._rate,
        )
43 | 83 |
|
44 | 84 |
|
class _BidiAudioOutput(BidiOutput):
    """Handle audio output from bidi agent.

    Audio chunks are queued in a buffer and played by a background task, so queuing never blocks on the device.
    An interruption event drops everything that is still queued.

    Attributes:
        _audio: PyAudio instance for audio system access, or None while stopped.
        _stream: Audio output stream, or None while stopped.
        _buffer: Deque buffer for queuing audio data, or None while stopped.
        _buffer_event: Event to signal when buffer has data, or None while stopped.
        _output_task: Background task for processing audio output, or None while stopped.
        _stopping: Set by `stop()` to tell the background task to exit.
    """

    _audio: pyaudio.PyAudio | None
    _stream: pyaudio.Stream | None
    _buffer: deque | None
    _buffer_event: asyncio.Event | None
    _output_task: asyncio.Task | None
    _stopping: bool

    _BUFFER_SIZE: int | None = None
    _CHANNELS: int = 1
    _DEVICE_INDEX: int | None = None
    _FORMAT: int = pyaudio.paInt16
    _FRAMES_PER_BUFFER: int = 512
    _RATE: int = 16000

    def __init__(self, config: dict[str, Any]) -> None:
        """Extract configs.

        Args:
            config: Audio configuration. Only the `output_*` keys are read; any missing key falls back to the
                class-level default.
        """
        self._buffer_size = config.get("output_buffer_size", _BidiAudioOutput._BUFFER_SIZE)
        self._channels = config.get("output_channels", _BidiAudioOutput._CHANNELS)
        self._device_index = config.get("output_device_index", _BidiAudioOutput._DEVICE_INDEX)
        self._format = config.get("output_format", _BidiAudioOutput._FORMAT)
        self._frames_per_buffer = config.get("output_frames_per_buffer", _BidiAudioOutput._FRAMES_PER_BUFFER)
        self._rate = config.get("output_rate", _BidiAudioOutput._RATE)

        # Nothing is opened until start(); initializing here lets stop() run safely on a never-started instance.
        self._audio = None
        self._stream = None
        self._buffer = None
        self._buffer_event = None
        self._output_task = None
        self._stopping = False

    async def start(self) -> None:
        """Start output stream.

        Opens the device and launches the background playback task. Does nothing if already started.
        """
        if self._output_task is not None:
            return

        audio = pyaudio.PyAudio()
        try:
            self._stream = audio.open(
                channels=self._channels,
                format=self._format,
                frames_per_buffer=self._frames_per_buffer,
                output=True,
                output_device_index=self._device_index,
                rate=self._rate,
            )
        except Exception:
            # Do not leak the PyAudio instance when the device cannot be opened.
            audio.terminate()
            raise

        self._audio = audio
        self._buffer = deque(maxlen=self._buffer_size)
        self._buffer_event = asyncio.Event()
        self._stopping = False
        self._output_task = asyncio.create_task(self._output())

    async def stop(self) -> None:
        """Stop output stream.

        Discards queued audio, waits for the playback task to finish its in-flight write, then releases the
        stream and the audio system. Does nothing if never started or already stopped.

        Raises:
            Exception: Whatever the playback task failed with, re-raised after resources are released.
        """
        if self._output_task is None:
            return

        # A flag is used instead of an in-buffer sentinel: a sentinel can be discarded by an interruption
        # clearing the buffer (or by a zero-length buffer), which would leave this method waiting forever.
        self._stopping = True
        self._buffer.clear()
        self._buffer_event.set()

        try:
            await self._output_task
        finally:
            stream, audio = self._stream, self._audio

            self._output_task = None
            self._buffer = None
            self._buffer_event = None
            self._stream = None
            self._audio = None

            try:
                stream.close()
            finally:
                # Release the audio system even if closing the stream fails.
                audio.terminate()

    async def __call__(self, event: BidiOutputEvent) -> None:
        """Queue audio for playback, or drop queued audio on interruption.

        Args:
            event: Output event from the agent. Audio stream events are decoded and appended to the buffer;
                interruption events clear the buffer. All other events are ignored.
        """
        if isinstance(event, BidiAudioStreamEvent):
            audio_bytes = base64.b64decode(event["audio"])
            self._buffer.append(audio_bytes)
            self._buffer_event.set()

        elif isinstance(event, BidiInterruptionEvent):
            # Only the buffer is cleared. The event is left alone so a pending wake-up (e.g. from stop()) is
            # never lost; the playback task copes with waking up to an empty buffer.
            self._buffer.clear()

    async def _output(self) -> None:
        """Play queued audio until `stop()` is requested.

        Waits for the buffer event, then drains the buffer chunk by chunk. Each blocking device write runs in
        a worker thread so the event loop stays responsive.
        """
        while not self._stopping:
            await self._buffer_event.wait()
            self._buffer_event.clear()

            while self._buffer and not self._stopping:
                audio_bytes = self._buffer.popleft()
                # Empty chunks carry no samples; skip them rather than treating them as a stop signal.
                if audio_bytes:
                    await asyncio.to_thread(self._stream.write, audio_bytes)
67 | 171 |
|
68 | 172 |
|
class BidiAudioIO:
    """Send and receive audio data from devices."""

    # Keys read by the input and output handlers; anything else would be silently ignored.
    _CONFIG_KEYS: frozenset[str] = frozenset(
        {
            "input_channels",
            "input_device_index",
            "input_format",
            "input_frames_per_buffer",
            "input_rate",
            "output_buffer_size",
            "output_channels",
            "output_device_index",
            "output_format",
            "output_frames_per_buffer",
            "output_rate",
        }
    )

    def __init__(self, **config: Any) -> None:
        """Initialize audio devices.

        Unrecognized keys are kept but have no effect; a warning is logged for them so that a misspelled
        option does not silently fall back to its default.

        Args:
            **config: Dictionary containing audio configuration:
                - input_channels (int): Input channels (default: 1)
                - input_device_index (int): Specific input device (optional)
                - input_format (int): Audio format (default: paInt16)
                - input_frames_per_buffer (int): Frames per buffer (default: 512)
                - input_rate (int): Input sample rate (default: 16000)
                - output_buffer_size (int): Maximum output buffer size (default: None)
                - output_channels (int): Output channels (default: 1)
                - output_device_index (int): Specific output device (optional)
                - output_format (int): Audio format (default: paInt16)
                - output_frames_per_buffer (int): Frames per buffer (default: 512)
                - output_rate (int): Output sample rate (default: 16000)
        """
        unknown_keys = sorted(config.keys() - self._CONFIG_KEYS)
        if unknown_keys:
            logger.warning("Ignoring unrecognized audio config keys: %s", unknown_keys)

        self._config = config

    def input(self) -> _BidiAudioInput:
        """Return a new audio BidiInput built from the stored configuration."""
        return _BidiAudioInput(self._config)

    def output(self) -> _BidiAudioOutput:
        """Return a new audio BidiOutput built from the stored configuration."""
        return _BidiAudioOutput(self._config)
0 commit comments