Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 64 additions & 37 deletions src/sdialog/audio/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,19 @@
from tqdm import tqdm
import soundfile as sf
from datasets import load_dataset
from typing import List, Optional, Union, Callable
from typing import List, Optional, Union, Callable, Any

from sdialog import Dialog
from sdialog.audio.utils import logger
from sdialog.audio.dialog import AudioDialog
from sdialog.audio.processing import AudioProcessor
from sdialog.audio import generate_utterances_audios
from sdialog.audio.normalizers import normalize_audio
from sdialog.audio.jsalt import MedicalRoomGenerator, RoomRole
from sdialog.audio.room import Room, RoomPosition, DirectivityType
from sdialog.audio.tts import BaseTTS, Qwen3TTS, Qwen3TTSVoiceClone
from sdialog.audio.room_acoustics_backends import resolve_room_acoustics_backend
from sdialog.audio.voice_database import Voice, BaseVoiceDatabase, HuggingfaceVoiceDatabase
from sdialog.audio import generate_utterances_audios, generate_audio_room_accoustic
from sdialog.audio.impulse_response_database import ImpulseResponseDatabase, RecordingDevice
from sdialog.audio.utils import (
Role,
Expand Down Expand Up @@ -108,7 +109,9 @@ def to_audio(
remove_silences: Optional[bool] = True,
normalize: Optional[bool] = True,
callback_mix_fn: Optional[Callable] = None,
callback_mix_kwargs: dict = {}
callback_mix_kwargs: dict = {},
room_acoustics_backend: Optional[Any] = None,
room_acoustics_backend_kwargs: Optional[dict] = None,
) -> AudioDialog:
"""
Convert a dialogue into an audio dialogue with comprehensive audio processing.
Expand Down Expand Up @@ -191,6 +194,13 @@ def to_audio(
:type callback_mix_fn: Optional[Callable]
:param callback_mix_kwargs: Keyword arguments for the callback function.
:type callback_mix_kwargs: dict
:param room_acoustics_backend: Backend used in step 3 for room acoustics simulation.
Supports None (defaults to PyroomAcousticsBackend),
a backend class/instance, or an object exposing simulate(...).
:type room_acoustics_backend: Optional[Any]
:param room_acoustics_backend_kwargs: Optional kwargs used to instantiate/configure
the room acoustics backend.
:type room_acoustics_backend_kwargs: Optional[dict]
:return: Audio dialogue with processed audio data.
:rtype: AudioDialog
"""
Expand Down Expand Up @@ -272,26 +282,35 @@ def to_audio(

if perform_room_acoustics:

# Resolve the room acoustics backend
_acoustics_backend = resolve_room_acoustics_backend(
room_acoustics_backend,
room_acoustics_backend_kwargs
)

# Place the speakers around the furniture in the room
for _role, _kwargs in speaker_positions.items():
if isinstance(room, Room):

if _role in room.speakers_positions:
continue
for _role, _kwargs in speaker_positions.items():

room.place_speaker_around_furniture(
speaker_name=_role,
furniture_name=_kwargs["furniture_name"],
max_distance=_kwargs["max_distance"],
side=_kwargs["side"]
)
if _role in room.speakers_positions:
continue

room.place_speaker_around_furniture(
speaker_name=_role,
furniture_name=_kwargs["furniture_name"],
max_distance=_kwargs["max_distance"],
side=_kwargs["side"]
)

_environment = {
"room": room,
"background_effect": background_effect,
"foreground_effect": foreground_effect,
"foreground_effect_position": foreground_effect_position,
"source_volumes": source_volumes,
"kwargs_pyroom": kwargs_pyroom
"kwargs_pyroom": kwargs_pyroom,
"room_acoustics_backend": _acoustics_backend
}

else:
Expand All @@ -318,6 +337,8 @@ def to_audio(
normalize=normalize,
callback_mix_fn=callback_mix_fn,
callback_mix_kwargs=callback_mix_kwargs,
room_acoustics_backend=room_acoustics_backend,
room_acoustics_backend_kwargs=room_acoustics_backend_kwargs,
)

finally:
Expand Down Expand Up @@ -547,7 +568,9 @@ def inference(
remove_silences: Optional[bool] = True,
normalize: Optional[bool] = True,
callback_mix_fn: Optional[Callable] = None,
callback_mix_kwargs: dict = {}
callback_mix_kwargs: dict = {},
room_acoustics_backend: Optional[Any] = None,
room_acoustics_backend_kwargs: Optional[dict] = None,
) -> AudioDialog:
"""
Execute the complete audio generation pipeline.
Expand Down Expand Up @@ -607,6 +630,13 @@ def inference(
:type callback_mix_fn: Optional[Callable]
:param callback_mix_kwargs: Keyword arguments for the callback function.
:type callback_mix_kwargs: dict
:param room_acoustics_backend: Backend used in step 3 for room acoustics simulation.
Supports None (defaults to PyroomAcousticsBackend),
a backend class/instance, or an object exposing simulate(...).
:type room_acoustics_backend: Optional[Any]
:param room_acoustics_backend_kwargs: Optional kwargs used to instantiate/configure
the room acoustics backend.
:type room_acoustics_backend_kwargs: Optional[dict]
:return: Processed audio dialogue with all audio data.
:rtype: AudioDialog

Expand Down Expand Up @@ -649,8 +679,14 @@ def inference(
else:
logger.info(f"[Initialization] Audio file format for generation is set to {audio_file_format}")

_env_backend = environment.get("room_acoustics_backend") if environment is not None else None
_backend = resolve_room_acoustics_backend(
room_acoustics_backend if room_acoustics_backend is not None else _env_backend,
room_acoustics_backend_kwargs
)

# Create variables from room from the environment
room: Room = (
room: Any = (
environment["room"]
if environment is not None
and "room" in environment
Expand All @@ -664,6 +700,8 @@ def inference(
and environment["kwargs_pyroom"] is not None
and "ray_tracing" in environment["kwargs_pyroom"]
and environment["kwargs_pyroom"]["ray_tracing"]
and isinstance(room, Room)
and _backend.name == "pyroom"
and room.directivity_type is not None
and room.directivity_type != DirectivityType.OMNIDIRECTIONAL
):
Expand Down Expand Up @@ -844,8 +882,8 @@ def inference(

logger.info("[Step 3] Starting...")

if not isinstance(environment["room"], Room):
raise ValueError("The room must be a Room object")
if _backend.requires_room and room is None:
raise ValueError(f"The selected acoustics backend '{_backend.name}' requires a room object.")

# Check if the step 2 is not done
if len(dialog.audio_step_2_filepath) < 1:
Expand All @@ -864,34 +902,23 @@ def inference(
logger.info(f"[Step 3] Generating room accoustic for dialogue {dialog.id}")

# Override the room name if provided otherwise use the hash of the room
room_name = room_name if room_name is not None else room.name
room_name = (
room_name
if room_name is not None
else (room.name if isinstance(room, Room) else _backend.name)
)

# Generate the room acoustics audio from the dialog and room object
dialog: AudioDialog = generate_audio_room_accoustic(
# Generate step-3 audio using the selected acoustics backend.
dialog: AudioDialog = _backend.simulate(
dialog=dialog,
room=room,
dialog_directory=dialog_directory,
room_name=room_name,
kwargs_pyroom=environment["kwargs_pyroom"] if "kwargs_pyroom" in environment else {},
source_volumes=environment["source_volumes"] if "source_volumes" in environment else {},
audio_file_format=audio_file_format,
background_effect=(
environment["background_effect"]
if "background_effect" in environment
else "white_noise"
),
foreground_effect=(
environment["foreground_effect"]
if "foreground_effect" in environment
else "ac_noise_minimal"
),
foreground_effect_position=(
environment["foreground_effect_position"]
if "foreground_effect_position" in environment
else RoomPosition.TOP_RIGHT
),
environment=environment,
callback_mix_fn=callback_mix_fn,
callback_mix_kwargs=callback_mix_kwargs,
sampling_rate=self.sampling_rate,
)

logger.info(f"[Step 3] Room accoustic has been generated successfully for dialogue {dialog.id}!")
Expand Down
16 changes: 16 additions & 0 deletions src/sdialog/audio/room_acoustics_backends/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""
Room acoustics backend package.

This package exposes room acoustics backend contracts, built-in backends,
and the backend resolver utility.
"""

from .base import BaseRoomAcousticsBackend
from .pyroomacoustics import PyroomAcousticsBackend
from .resolver import resolve_room_acoustics_backend

# Public names of this package: the abstract backend contract, the built-in
# pyroomacoustics backend, and the backend resolver helper.
__all__ = [
"BaseRoomAcousticsBackend",
"PyroomAcousticsBackend",
"resolve_room_acoustics_backend",
]
55 changes: 55 additions & 0 deletions src/sdialog/audio/room_acoustics_backends/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""
Base room acoustics backend contract.
"""

from abc import ABC, abstractmethod
from typing import Any, Callable, Optional

from sdialog.audio.dialog import AudioDialog


class BaseRoomAcousticsBackend(ABC):
    """
    Interface shared by all room acoustics backends.

    A concrete backend subclasses this class, overrides the two class
    attributes below as needed and implements :meth:`simulate`.
    """

    # Short identifier of the backend.
    name: str = "base"
    # Flag telling callers whether this backend needs a room object.
    requires_room: bool = True

    @abstractmethod
    def simulate(
        self,
        dialog: AudioDialog,
        room: Optional[Any],
        dialog_directory: str,
        room_name: str,
        audio_file_format: str = "wav",
        environment: Optional[dict] = None,
        callback_mix_fn: Optional[Callable] = None,
        callback_mix_kwargs: Optional[dict] = None,
        sampling_rate: int = 44_100,
    ) -> AudioDialog:
        """
        Simulate the room acoustics for a dialog and return the updated dialog.

        This method is abstract: the base class provides no implementation,
        every backend has to supply its own.

        :param dialog: Audio dialog that the simulation updates.
        :type dialog: AudioDialog
        :param room: Room configuration to simulate (may be ``None``).
        :type room: Optional[Any]
        :param dialog_directory: Relative directory receiving the generated files.
        :type dialog_directory: str
        :param room_name: Name of the room profile being generated.
        :type room_name: str
        :param audio_file_format: Format of the exported audio files (default: "wav").
        :type audio_file_format: str
        :param environment: Parameters specific to the backend.
        :type environment: Optional[dict]
        :param callback_mix_fn: Callback invoked while mixing audio, if any.
        :type callback_mix_fn: Optional[Callable]
        :param callback_mix_kwargs: Keyword arguments forwarded to the mix callback.
        :type callback_mix_kwargs: Optional[dict]
        :param sampling_rate: Sampling rate of the generated audio (default: 44100).
        :type sampling_rate: int
        :return: The dialog enriched with the room acoustics outputs.
        :rtype: AudioDialog
        """
78 changes: 78 additions & 0 deletions src/sdialog/audio/room_acoustics_backends/pyroomacoustics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Pyroomacoustics backend implementation.
"""

from typing import Any, Callable, Optional

from sdialog.audio.dialog import AudioDialog
from sdialog.audio.room import Room, RoomPosition

from .base import BaseRoomAcousticsBackend


class PyroomAcousticsBackend(BaseRoomAcousticsBackend):
    """
    Backend that delegates the simulation to the existing pyroomacoustics
    flow (``generate_audio_room_accoustic``).
    """

    name = "pyroom"
    requires_room = True

    def simulate(
        self,
        dialog: AudioDialog,
        room: Optional[Any],
        dialog_directory: str,
        room_name: str,
        audio_file_format: str = "wav",
        environment: Optional[dict] = None,
        callback_mix_fn: Optional[Callable] = None,
        callback_mix_kwargs: Optional[dict] = None,
        sampling_rate: int = 44_100,
    ) -> AudioDialog:
        """
        Produce the room acoustics audio of a dialog through pyroomacoustics.

        Settings are read from ``environment`` under the keys
        ``kwargs_pyroom``, ``source_volumes``, ``background_effect``,
        ``foreground_effect`` and ``foreground_effect_position``; a missing
        key falls back to ``{}``, ``{}``, ``"white_noise"``,
        ``"ac_noise_minimal"`` and ``RoomPosition.TOP_RIGHT`` respectively.

        :param dialog: Audio dialog that the simulation updates.
        :type dialog: AudioDialog
        :param room: Room to simulate; has to be a ``Room`` instance.
        :type room: Optional[Any]
        :param dialog_directory: Relative directory receiving the generated files.
        :type dialog_directory: str
        :param room_name: Name of the room profile being generated.
        :type room_name: str
        :param audio_file_format: Format of the exported audio files (default: "wav").
        :type audio_file_format: str
        :param environment: Optional overrides for the pyroom settings listed above.
        :type environment: Optional[dict]
        :param callback_mix_fn: Callback invoked while mixing audio, if any.
        :type callback_mix_fn: Optional[Callable]
        :param callback_mix_kwargs: Keyword arguments forwarded to the mix callback.
        :type callback_mix_kwargs: Optional[dict]
        :param sampling_rate: Ignored; present only to match the backend interface.
        :type sampling_rate: int
        :return: The dialog enriched with the room acoustics outputs.
        :rtype: AudioDialog
        :raises ValueError: When ``room`` is not a ``Room`` instance.
        """
        # The argument is part of the shared signature but unused here.
        del sampling_rate

        # Guard: this backend only knows how to work with Room objects.
        if not isinstance(room, Room):
            raise ValueError("PyroomAcousticsBackend expects `room` to be an instance of `Room`.")

        # Function-scope import, kept as in the original implementation
        # (presumably to avoid a circular import with sdialog.audio -- TODO confirm).
        from sdialog.audio import generate_audio_room_accoustic

        # A missing or empty environment means "use every default".
        settings = environment if environment else {}

        pyroom_options = settings.get("kwargs_pyroom", {})
        volumes = settings.get("source_volumes", {})
        background = settings.get("background_effect", "white_noise")
        foreground = settings.get("foreground_effect", "ac_noise_minimal")
        foreground_position = settings.get("foreground_effect_position", RoomPosition.TOP_RIGHT)
        mix_kwargs = callback_mix_kwargs if callback_mix_kwargs else {}

        return generate_audio_room_accoustic(
            dialog=dialog,
            room=room,
            dialog_directory=dialog_directory,
            room_name=room_name,
            kwargs_pyroom=pyroom_options,
            source_volumes=volumes,
            audio_file_format=audio_file_format,
            background_effect=background,
            foreground_effect=foreground,
            foreground_effect_position=foreground_position,
            callback_mix_fn=callback_mix_fn,
            callback_mix_kwargs=mix_kwargs,
        )
Loading
Loading