Skip to content

Commit 68e4028

Browse files
committed
adjust TTFB calculation and speaker metrics
1 parent d267d64 commit 68e4028

2 files changed

Lines changed: 56 additions & 46 deletions

File tree

sdk/voice/speechmatics/voice/_client.py

Lines changed: 56 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
from ._models import SpeakerFocusConfig
4747
from ._models import SpeakerFocusMode
4848
from ._models import SpeakerMetricsMessage
49+
from ._models import SpeakerSegment
4950
from ._models import SpeakerSegmentView
5051
from ._models import SpeechFragment
5152
from ._models import SpeechSegmentEmitMode
@@ -780,6 +781,11 @@ def _calculate_ttfb(self, end_time: float) -> None:
780781
Args:
781782
end_time: The end time of the payload from the STT engine.
782783
"""
784+
785+
# Skip if not enabled
786+
if not (self.listeners(AgentServerMessageType.TTFB_METRICS) or self.listeners(AgentServerMessageType.METRICS)):
787+
return
788+
783789
# Skip if no fragments are words
784790
if len(self._speech_fragments) == 0 or all(f.type_ != "word" for f in self._speech_fragments):
785791
return
@@ -807,6 +813,48 @@ def _calculate_ttfb(self, end_time: float) -> None:
807813
TTFBMetricsMessage(ttfb=int(self._last_ttfb)),
808814
)
809815

816+
def calculate_speaker_metrics(self, final_segments: list[SpeakerSegment]) -> None:
    """Update per-speaker word metrics from final segments and emit them.

    Every fragment in ``final_segments`` whose ``type_`` is ``"word"`` and
    whose ``speaker`` is set counts as one word for that speaker. For each
    such fragment the matching entry in ``self._session_speakers`` is created
    on first sight, its ``word_count`` is incremented and its ``last_heard``
    is set to the fragment's ``end_time``. A ``SpeakerMetricsMessage`` with
    all session speakers is then emitted.

    Nothing is updated or emitted when there are no listeners for
    ``AgentServerMessageType.SPEAKER_METRICS`` or when the segments contain
    no word fragment with a speaker.

    NOTE(review): this method only excludes fragments without a speaker;
    any filtering of ignored speakers is presumably done by the caller
    before the segments get here - confirm.

    Args:
        final_segments: The final segments to calculate the speaker metrics for.
    """

    # Skip if not enabled
    if not self.listeners(AgentServerMessageType.SPEAKER_METRICS):
        return

    # Finalized words that are attributed to a speaker
    final_words = [
        frag
        for seg in final_segments
        for frag in seg.fragments
        if frag.type_ == "word" and frag.speaker is not None
    ]

    # Nothing to count means nothing new to report
    if not final_words:
        return

    # Update the metrics of the speakers in the session. The list above is
    # already restricted to fragments with a speaker, so no further None
    # check is needed here.
    for frag in final_words:
        # Create new speaker
        if frag.speaker not in self._session_speakers:
            self._session_speakers[frag.speaker] = SessionSpeaker(speaker_id=frag.speaker)

        # Update metrics
        speaker = self._session_speakers[frag.speaker]
        speaker.word_count += 1
        speaker.last_heard = frag.end_time

    # Emit
    self._emit_message(
        SpeakerMetricsMessage(
            speakers=list(self._session_speakers.values()),
        ),
    )
857+
810858
# ============================================================================
811859
# TRANSCRIPT PROCESSING
812860
# ============================================================================
@@ -950,9 +998,7 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =
950998
self._logger.debug(json.dumps(debug_payload))
951999

9521000
# Update TTFB (only if there are listeners)
953-
if not is_final and (
954-
self.listeners(AgentServerMessageType.TTFB_METRICS) or self.listeners(AgentServerMessageType.METRICS)
955-
):
1001+
if not is_final:
9561002
self._calculate_ttfb(end_time=payload_end_time)
9571003

9581004
# Fragments available
@@ -1172,39 +1218,8 @@ async def _emit_segments(self, finalize: bool = False, end_of_turn: bool = False
11721218
self._turn_start_time = self._current_view.start_time
11731219

11741220
# Send updated speaker metrics
1175-
if self._dz_enabled and self.listeners(AgentServerMessageType.SPEAKER_METRICS):
1176-
"""Update the metrics of the speakers in the sesseion."""
1177-
1178-
# Finalized words
1179-
final_words = [
1180-
f
1181-
for seg in final_segments
1182-
for f in seg.fragments
1183-
if f.type_ == "word" and f.speaker is not None
1184-
]
1185-
1186-
# Only process if we have words
1187-
if final_words:
1188-
# Update the metrics of the speakers in the session
1189-
for frag in final_words:
1190-
# Check we have a speaker
1191-
if frag.speaker is None:
1192-
continue
1193-
1194-
# Create new speaker
1195-
if frag.speaker not in self._session_speakers:
1196-
self._session_speakers[frag.speaker] = SessionSpeaker(speaker_id=frag.speaker)
1197-
1198-
# Update metrics
1199-
self._session_speakers[frag.speaker].word_count += 1
1200-
self._session_speakers[frag.speaker].last_heard = frag.end_time
1201-
1202-
# Emit
1203-
self._emit_message(
1204-
SpeakerMetricsMessage(
1205-
speakers=list(self._session_speakers.values()),
1206-
),
1207-
)
1221+
if self._dz_enabled:
1222+
self.calculate_speaker_metrics(final_segments)
12081223

12091224
# Emit END_OF_TURN
12101225
if end_of_turn and self._previous_view:
@@ -1265,7 +1280,6 @@ async def _calculate_finalize_delay(
12651280
# Calculations
12661281
clamped_delay: float = self._config.end_of_utterance_max_delay
12671282
finalize_delay: Optional[float] = None
1268-
time_slip: Optional[float] = None
12691283

12701284
# Reasons for the calculation
12711285
reasons: list[tuple[float, str]] = []
@@ -1327,11 +1341,8 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
13271341
# Clamp to max delay
13281342
clamped_delay = min(delay, self._config.end_of_utterance_max_delay)
13291343

1330-
# Establish the real-world time
1331-
time_slip = max(self._total_time - self._last_fragment_end_time, 0)
1332-
13331344
# Adjust time and make sure no less than 25ms
1334-
finalize_delay = max(clamped_delay - time_slip, 0.025)
1345+
finalize_delay = max(clamped_delay - (self._last_ttfb / 1000), 0.025)
13351346

13361347
# Emit prediction
13371348
if self.listeners(AgentServerMessageType.END_OF_TURN_PREDICTION):
@@ -1340,7 +1351,6 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
13401351
turn_id=self._turn_id,
13411352
metadata=TurnPredictionMetadata(
13421353
ttl=round(finalize_delay, 2),
1343-
time_slip=round(time_slip, 2),
13441354
reasons=[_reason for _, _reason in reasons],
13451355
),
13461356
),
@@ -1459,12 +1469,14 @@ def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
14591469
),
14601470
)
14611471

1472+
# Update current speaker
1473+
self._current_speaker = latest_speaker
1474+
14621475
# No further processing if we have no new fragments and we are not speaking
14631476
if has_valid_partial == current_is_speaking:
14641477
return
14651478

1466-
# Update current speaker + speaking states
1467-
self._current_speaker = latest_speaker
1479+
# Update speaking state
14681480
self._is_speaking = not current_is_speaking
14691481

14701482
# Event time

sdk/voice/speechmatics/voice/_models.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,12 +1030,10 @@ class TurnPredictionMetadata(BaseMessageModel):
10301030
10311031
Parameters:
10321032
ttl: The time to live of the prediction in seconds.
1033-
time_slip: The time slip of the prediction in seconds.
10341033
reasons: The reasons for the prediction.
10351034
"""
10361035

10371036
ttl: float
1038-
time_slip: float
10391037
reasons: list[str]
10401038

10411039

0 commit comments

Comments
 (0)