diff --git a/src/sdialog/audio/pipeline.py b/src/sdialog/audio/pipeline.py index 09af4b6..28b7617 100644 --- a/src/sdialog/audio/pipeline.py +++ b/src/sdialog/audio/pipeline.py @@ -52,18 +52,19 @@ from tqdm import tqdm import soundfile as sf from datasets import load_dataset -from typing import List, Optional, Union, Callable +from typing import List, Optional, Union, Callable, Any from sdialog import Dialog from sdialog.audio.utils import logger from sdialog.audio.dialog import AudioDialog from sdialog.audio.processing import AudioProcessor +from sdialog.audio import generate_utterances_audios from sdialog.audio.normalizers import normalize_audio from sdialog.audio.jsalt import MedicalRoomGenerator, RoomRole from sdialog.audio.room import Room, RoomPosition, DirectivityType from sdialog.audio.tts import BaseTTS, Qwen3TTS, Qwen3TTSVoiceClone +from sdialog.audio.room_acoustics_backends import resolve_room_acoustics_backend from sdialog.audio.voice_database import Voice, BaseVoiceDatabase, HuggingfaceVoiceDatabase -from sdialog.audio import generate_utterances_audios, generate_audio_room_accoustic from sdialog.audio.impulse_response_database import ImpulseResponseDatabase, RecordingDevice from sdialog.audio.utils import ( Role, @@ -108,7 +109,9 @@ def to_audio( remove_silences: Optional[bool] = True, normalize: Optional[bool] = True, callback_mix_fn: Optional[Callable] = None, - callback_mix_kwargs: dict = {} + callback_mix_kwargs: dict = {}, + room_acoustics_backend: Optional[Any] = None, + room_acoustics_backend_kwargs: Optional[dict] = None, ) -> AudioDialog: """ Convert a dialogue into an audio dialogue with comprehensive audio processing. @@ -191,6 +194,13 @@ def to_audio( :type callback_mix_fn: Optional[Callable] :param callback_mix_kwargs: Keyword arguments for the callback function. :type callback_mix_kwargs: dict + :param room_acoustics_backend: Backend used in step 3 for room acoustics simulation. 
+ Supports None (defaults to PyroomAcousticsBackend), + a backend class/instance, or an object exposing simulate(...). + :type room_acoustics_backend: Optional[Any] + :param room_acoustics_backend_kwargs: Optional kwargs used to instantiate/configure + the room acoustics backend. + :type room_acoustics_backend_kwargs: Optional[dict] :return: Audio dialogue with processed audio data. :rtype: AudioDialog """ @@ -272,18 +282,26 @@ def to_audio( if perform_room_acoustics: + # Resolve the room acoustics backend + _acoustics_backend = resolve_room_acoustics_backend( + room_acoustics_backend, + room_acoustics_backend_kwargs + ) + # Place the speakers around the furnitures in the room - for _role, _kwargs in speaker_positions.items(): + if isinstance(room, Room): - if _role in room.speakers_positions: - continue + for _role, _kwargs in speaker_positions.items(): - room.place_speaker_around_furniture( - speaker_name=_role, - furniture_name=_kwargs["furniture_name"], - max_distance=_kwargs["max_distance"], - side=_kwargs["side"] - ) + if _role in room.speakers_positions: + continue + + room.place_speaker_around_furniture( + speaker_name=_role, + furniture_name=_kwargs["furniture_name"], + max_distance=_kwargs["max_distance"], + side=_kwargs["side"] + ) _environment = { "room": room, @@ -291,7 +309,8 @@ def to_audio( "foreground_effect": foreground_effect, "foreground_effect_position": foreground_effect_position, "source_volumes": source_volumes, - "kwargs_pyroom": kwargs_pyroom + "kwargs_pyroom": kwargs_pyroom, + "room_acoustics_backend": _acoustics_backend } else: @@ -318,6 +337,8 @@ def to_audio( normalize=normalize, callback_mix_fn=callback_mix_fn, callback_mix_kwargs=callback_mix_kwargs, + room_acoustics_backend=room_acoustics_backend, + room_acoustics_backend_kwargs=room_acoustics_backend_kwargs, ) finally: @@ -547,7 +568,9 @@ def inference( remove_silences: Optional[bool] = True, normalize: Optional[bool] = True, callback_mix_fn: Optional[Callable] = None, - 
callback_mix_kwargs: dict = {} + callback_mix_kwargs: dict = {}, + room_acoustics_backend: Optional[Any] = None, + room_acoustics_backend_kwargs: Optional[dict] = None, ) -> AudioDialog: """ Execute the complete audio generation pipeline. @@ -607,6 +630,13 @@ def inference( :type callback_mix_fn: Optional[Callable] :param callback_mix_kwargs: Keyword arguments for the callback function. :type callback_mix_kwargs: dict + :param room_acoustics_backend: Backend used in step 3 for room acoustics simulation. + Supports None (defaults to PyroomAcousticsBackend), + a backend class/instance, or an object exposing simulate(...). + :type room_acoustics_backend: Optional[Any] + :param room_acoustics_backend_kwargs: Optional kwargs used to instantiate/configure + the room acoustics backend. + :type room_acoustics_backend_kwargs: Optional[dict] :return: Processed audio dialogue with all audio data. :rtype: AudioDialog @@ -649,8 +679,14 @@ def inference( else: logger.info(f"[Initialization] Audio file format for generation is set to {audio_file_format}") + _env_backend = environment.get("room_acoustics_backend") if environment is not None else None + _backend = resolve_room_acoustics_backend( + room_acoustics_backend if room_acoustics_backend is not None else _env_backend, + room_acoustics_backend_kwargs + ) + # Create variables from room from the environment - room: Room = ( + room: Any = ( environment["room"] if environment is not None and "room" in environment @@ -664,6 +700,8 @@ def inference( and environment["kwargs_pyroom"] is not None and "ray_tracing" in environment["kwargs_pyroom"] and environment["kwargs_pyroom"]["ray_tracing"] + and isinstance(room, Room) + and _backend.name == "pyroom" and room.directivity_type is not None and room.directivity_type != DirectivityType.OMNIDIRECTIONAL ): @@ -844,8 +882,8 @@ def inference( logger.info("[Step 3] Starting...") - if not isinstance(environment["room"], Room): - raise ValueError("The room must be a Room object") + if 
_backend.requires_room and room is None: + raise ValueError(f"The selected acoustics backend '{_backend.name}' requires a room object.") # Check if the step 2 is not done if len(dialog.audio_step_2_filepath) < 1: @@ -864,34 +902,23 @@ def inference( logger.info(f"[Step 3] Generating room accoustic for dialogue {dialog.id}") # Override the room name if provided otherwise use the hash of the room - room_name = room_name if room_name is not None else room.name + room_name = ( + room_name + if room_name is not None + else (room.name if isinstance(room, Room) else _backend.name) + ) - # Generate the audio room accoustic from the dialog and room object - dialog: AudioDialog = generate_audio_room_accoustic( + # Generate step-3 audio using the selected acoustics backend. + dialog: AudioDialog = _backend.simulate( dialog=dialog, room=room, dialog_directory=dialog_directory, room_name=room_name, - kwargs_pyroom=environment["kwargs_pyroom"] if "kwargs_pyroom" in environment else {}, - source_volumes=environment["source_volumes"] if "source_volumes" in environment else {}, audio_file_format=audio_file_format, - background_effect=( - environment["background_effect"] - if "background_effect" in environment - else "white_noise" - ), - foreground_effect=( - environment["foreground_effect"] - if "foreground_effect" in environment - else "ac_noise_minimal" - ), - foreground_effect_position=( - environment["foreground_effect_position"] - if "foreground_effect_position" in environment - else RoomPosition.TOP_RIGHT - ), + environment=environment, callback_mix_fn=callback_mix_fn, callback_mix_kwargs=callback_mix_kwargs, + sampling_rate=self.sampling_rate, ) logger.info(f"[Step 3] Room accoustic has been generated successfully for dialogue {dialog.id}!") diff --git a/src/sdialog/audio/room_acoustics_backends/__init__.py b/src/sdialog/audio/room_acoustics_backends/__init__.py new file mode 100644 index 0000000..1d81e97 --- /dev/null +++ 
class BaseRoomAcousticsBackend(ABC):
    """
    Contract that every room acoustics backend has to fulfil.

    Concrete backends implement :meth:`simulate` and may override the two
    class attributes below to describe themselves to the caller.
    """

    # Short identifier of the backend; concrete backends override it.
    name: str = "base"
    # Whether the caller must supply a room object before calling ``simulate``.
    requires_room: bool = True

    @abstractmethod
    def simulate(
        self,
        dialog: AudioDialog,
        room: Optional[Any],
        dialog_directory: str,
        room_name: str,
        audio_file_format: str = "wav",
        environment: Optional[dict] = None,
        callback_mix_fn: Optional[Callable] = None,
        callback_mix_kwargs: Optional[dict] = None,
        sampling_rate: int = 44_100,
    ) -> AudioDialog:
        """
        Simulate the room acoustics for a dialog and return the updated dialog.

        :param dialog: Audio dialog whose outputs are updated.
        :type dialog: AudioDialog
        :param room: Room configuration handed to the simulation.
        :type room: Optional[Any]
        :param dialog_directory: Relative directory receiving the generated files.
        :type dialog_directory: str
        :param room_name: Name of the room profile being generated.
        :type room_name: str
        :param audio_file_format: Format of the exported audio files (default: "wav").
        :type audio_file_format: str
        :param environment: Backend-specific environment parameters.
        :type environment: Optional[dict]
        :param callback_mix_fn: Optional callback applied while mixing audio.
        :type callback_mix_fn: Optional[Callable]
        :param callback_mix_kwargs: Optional keyword arguments for the mix callback.
        :type callback_mix_kwargs: Optional[dict]
        :param sampling_rate: Sampling rate of the generated audio (default: 44100).
        :type sampling_rate: int
        :return: The dialog enriched with the room acoustics outputs.
        :rtype: AudioDialog
        """


class PyroomAcousticsBackend(BaseRoomAcousticsBackend):
    """
    Backend that forwards the simulation to ``generate_audio_room_accoustic``,
    i.e. the pre-existing pyroomacoustics flow.
    """

    name = "pyroom"
    requires_room = True

    def simulate(
        self,
        dialog: AudioDialog,
        room: Optional[Any],
        dialog_directory: str,
        room_name: str,
        audio_file_format: str = "wav",
        environment: Optional[dict] = None,
        callback_mix_fn: Optional[Callable] = None,
        callback_mix_kwargs: Optional[dict] = None,
        sampling_rate: int = 44_100,
    ) -> AudioDialog:
        """
        Produce the room acoustics audio of a dialog through pyroomacoustics.

        Settings missing from ``environment`` fall back to the defaults used
        below ("white_noise", "ac_noise_minimal", ``RoomPosition.TOP_RIGHT``
        and empty dictionaries).

        :param dialog: Audio dialog whose outputs are updated.
        :type dialog: AudioDialog
        :param room: Room configuration; must be a ``Room`` instance.
        :type room: Optional[Any]
        :param dialog_directory: Relative directory receiving the generated files.
        :type dialog_directory: str
        :param room_name: Name of the room profile being generated.
        :type room_name: str
        :param audio_file_format: Format of the exported audio files (default: "wav").
        :type audio_file_format: str
        :param environment: Optional overrides for the pyroom settings.
        :type environment: Optional[dict]
        :param callback_mix_fn: Optional callback applied while mixing audio.
        :type callback_mix_fn: Optional[Callable]
        :param callback_mix_kwargs: Optional keyword arguments for the mix callback.
        :type callback_mix_kwargs: Optional[dict]
        :param sampling_rate: Ignored; accepted only to match the backend contract.
        :type sampling_rate: int
        :return: The dialog enriched with the room acoustics outputs.
        :rtype: AudioDialog
        :raises ValueError: If ``room`` is not an instance of ``Room``.
        """
        if not isinstance(room, Room):
            raise ValueError("PyroomAcousticsBackend expects `room` to be an instance of `Room`.")

        # The delegated function takes no sampling rate, so the argument is dropped.
        del sampling_rate

        # Function-scope import kept from the original implementation
        # (presumably to avoid a circular import with ``sdialog.audio`` — confirm).
        from sdialog.audio import generate_audio_room_accoustic

        settings = environment if environment else {}
        mix_kwargs = callback_mix_kwargs if callback_mix_kwargs else {}

        pyroom_arguments = {
            "dialog": dialog,
            "room": room,
            "dialog_directory": dialog_directory,
            "room_name": room_name,
            "kwargs_pyroom": settings.get("kwargs_pyroom", {}),
            "source_volumes": settings.get("source_volumes", {}),
            "audio_file_format": audio_file_format,
            "background_effect": settings.get("background_effect", "white_noise"),
            "foreground_effect": settings.get("foreground_effect", "ac_noise_minimal"),
            "foreground_effect_position": settings.get(
                "foreground_effect_position", RoomPosition.TOP_RIGHT
            ),
            "callback_mix_fn": callback_mix_fn,
            "callback_mix_kwargs": mix_kwargs,
        }
        return generate_audio_room_accoustic(**pyroom_arguments)
class _DuckTypedBackendAdapter(BaseRoomAcousticsBackend):
    """
    Give a duck-typed backend the attributes of ``BaseRoomAcousticsBackend``.

    The audio pipeline reads ``name`` and ``requires_room`` on the resolved
    backend, so an object that only exposes ``simulate(...)`` would fail with
    ``AttributeError``. This adapter supplies the missing attributes and
    forwards ``simulate`` untouched.
    """

    def __init__(self, backend: Any) -> None:
        """
        :param backend: Object exposing a callable ``simulate`` attribute.
        :type backend: Any
        """
        self._backend = backend
        # Keep whatever the wrapped object declares; otherwise fall back to its
        # class name and to the base-class default for ``requires_room``.
        self.name = getattr(backend, "name", type(backend).__name__)
        self.requires_room = getattr(
            backend, "requires_room", BaseRoomAcousticsBackend.requires_room
        )

    def simulate(self, *args: Any, **kwargs: Any) -> Any:
        """
        Forward the call, with all its arguments, to the wrapped backend.

        :return: Whatever the wrapped backend's ``simulate`` returns.
        :rtype: Any
        """
        return self._backend.simulate(*args, **kwargs)


def resolve_room_acoustics_backend(
    room_acoustics_backend: Optional[Any] = None,
    backend_kwargs: Optional[dict] = None,
) -> BaseRoomAcousticsBackend:
    """
    Resolve room acoustics backend specification into a backend instance.

    Resolution rules, in order:

    * ``None`` -> a new ``PyroomAcousticsBackend``.
    * ``BaseRoomAcousticsBackend`` instance -> returned unchanged.
    * ``BaseRoomAcousticsBackend`` subclass -> instantiated with ``backend_kwargs``.
    * Any other class exposing a callable ``simulate`` -> instantiated with
      ``backend_kwargs``, then handled like the next case.
    * Any object exposing a callable ``simulate`` -> returned unchanged when it
      also defines ``name`` and ``requires_room``, otherwise wrapped in an
      adapter that supplies them.

    :param room_acoustics_backend: Backend specification. Supports ``None``, backend classes,
                                   backend instances, or objects exposing ``simulate(...)``.
    :type room_acoustics_backend: Optional[Any]
    :param backend_kwargs: Keyword arguments used when instantiating backend classes.
                           Ignored when ``room_acoustics_backend`` is ``None`` or already
                           an instance.
    :type backend_kwargs: Optional[dict]
    :return: Resolved room acoustics backend instance.
    :rtype: BaseRoomAcousticsBackend
    :raises ValueError: If the backend specification is not supported.
    """
    kwargs = backend_kwargs or {}

    if room_acoustics_backend is None:
        return PyroomAcousticsBackend()

    if isinstance(room_acoustics_backend, BaseRoomAcousticsBackend):
        return room_acoustics_backend

    backend = room_acoustics_backend

    if isinstance(backend, type):
        if issubclass(backend, BaseRoomAcousticsBackend):
            return backend(**kwargs)
        # A plain class passes the ``simulate`` check below too; it must be
        # instantiated, otherwise ``simulate`` would later be called unbound.
        if callable(getattr(backend, "simulate", None)):
            backend = backend(**kwargs)

    if callable(getattr(backend, "simulate", None)):
        if hasattr(backend, "name") and hasattr(backend, "requires_room"):
            return backend
        return _DuckTypedBackendAdapter(backend)

    raise ValueError(
        "Unsupported `room_acoustics_backend`. Use None, a backend class/instance, "
        "or an object exposing `simulate(...)`."
    )