4646from ._models import SpeakerFocusConfig
4747from ._models import SpeakerFocusMode
4848from ._models import SpeakerMetricsMessage
49+ from ._models import SpeakerSegment
4950from ._models import SpeakerSegmentView
5051from ._models import SpeechFragment
5152from ._models import SpeechSegmentEmitMode
@@ -780,6 +781,11 @@ def _calculate_ttfb(self, end_time: float) -> None:
780781 Args:
781782 end_time: The end time of the payload from the STT engine.
782783 """
784+
785+ # Skip if not enabled
786+ if not (self .listeners (AgentServerMessageType .TTFB_METRICS ) or self .listeners (AgentServerMessageType .METRICS )):
787+ return
788+
783789 # Skip if no fragments are words
784790 if len (self ._speech_fragments ) == 0 or all (f .type_ != "word" for f in self ._speech_fragments ):
785791 return
@@ -807,6 +813,48 @@ def _calculate_ttfb(self, end_time: float) -> None:
807813 TTFBMetricsMessage (ttfb = int (self ._last_ttfb )),
808814 )
809815
def calculate_speaker_metrics(self, final_segments: list[SpeakerSegment]) -> None:
    """Update per-speaker word metrics from finalized segments and emit them.

    Every fragment of type ``"word"`` that carries a speaker label increments
    that speaker's ``word_count`` and sets ``last_heard`` to the fragment's
    ``end_time``. Speakers not seen before in this session are added to
    ``self._session_speakers``. When at least one such word was found, a
    ``SpeakerMetricsMessage`` listing all session speakers is emitted.

    Nothing is updated or emitted when there are no listeners for
    ``AgentServerMessageType.SPEAKER_METRICS`` or when the segments contain
    no speaker-labelled words.

    NOTE(review): this method only filters out fragments without a speaker;
    presumably ignored speakers are already excluded from ``final_segments``
    by the caller - confirm.

    Args:
        final_segments: The final segments to calculate the speaker metrics for.
    """

    # Skip if not enabled
    if not self.listeners(AgentServerMessageType.SPEAKER_METRICS):
        return

    # Finalized words that carry a speaker label
    final_words = [
        frag
        for seg in final_segments
        for frag in seg.fragments
        if frag.type_ == "word" and frag.speaker is not None
    ]

    # Only process (and emit) if we have words
    if not final_words:
        return

    # Update the metrics of the speakers in the session. The speaker is
    # guaranteed to be set here because of the filter above.
    for frag in final_words:
        # Create new speaker
        if frag.speaker not in self._session_speakers:
            self._session_speakers[frag.speaker] = SessionSpeaker(speaker_id=frag.speaker)

        # Update metrics
        speaker = self._session_speakers[frag.speaker]
        speaker.word_count += 1
        speaker.last_heard = frag.end_time

    # Emit
    self._emit_message(
        SpeakerMetricsMessage(
            speakers=list(self._session_speakers.values()),
        ),
    )
810858 # ============================================================================
811859 # TRANSCRIPT PROCESSING
812860 # ============================================================================
@@ -950,9 +998,7 @@ async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool =
950998 self ._logger .debug (json .dumps (debug_payload ))
951999
9521000 # Update TTFB (only if there are listeners)
953- if not is_final and (
954- self .listeners (AgentServerMessageType .TTFB_METRICS ) or self .listeners (AgentServerMessageType .METRICS )
955- ):
1001+ if not is_final :
9561002 self ._calculate_ttfb (end_time = payload_end_time )
9571003
9581004 # Fragments available
@@ -1172,39 +1218,8 @@ async def _emit_segments(self, finalize: bool = False, end_of_turn: bool = False
11721218 self ._turn_start_time = self ._current_view .start_time
11731219
11741220 # Send updated speaker metrics
1175- if self ._dz_enabled and self .listeners (AgentServerMessageType .SPEAKER_METRICS ):
1176- """Update the metrics of the speakers in the sesseion."""
1177-
1178- # Finalized words
1179- final_words = [
1180- f
1181- for seg in final_segments
1182- for f in seg .fragments
1183- if f .type_ == "word" and f .speaker is not None
1184- ]
1185-
1186- # Only process if we have words
1187- if final_words :
1188- # Update the metrics of the speakers in the session
1189- for frag in final_words :
1190- # Check we have a speaker
1191- if frag .speaker is None :
1192- continue
1193-
1194- # Create new speaker
1195- if frag .speaker not in self ._session_speakers :
1196- self ._session_speakers [frag .speaker ] = SessionSpeaker (speaker_id = frag .speaker )
1197-
1198- # Update metrics
1199- self ._session_speakers [frag .speaker ].word_count += 1
1200- self ._session_speakers [frag .speaker ].last_heard = frag .end_time
1201-
1202- # Emit
1203- self ._emit_message (
1204- SpeakerMetricsMessage (
1205- speakers = list (self ._session_speakers .values ()),
1206- ),
1207- )
1221+ if self ._dz_enabled :
1222+ self .calculate_speaker_metrics (final_segments )
12081223
12091224 # Emit END_OF_TURN
12101225 if end_of_turn and self ._previous_view :
@@ -1265,7 +1280,6 @@ async def _calculate_finalize_delay(
12651280 # Calculations
12661281 clamped_delay : float = self ._config .end_of_utterance_max_delay
12671282 finalize_delay : Optional [float ] = None
1268- time_slip : Optional [float ] = None
12691283
12701284 # Reasons for the calculation
12711285 reasons : list [tuple [float , str ]] = []
@@ -1327,11 +1341,8 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
13271341 # Clamp to max delay
13281342 clamped_delay = min (delay , self ._config .end_of_utterance_max_delay )
13291343
1330- # Establish the real-world time
1331- time_slip = max (self ._total_time - self ._last_fragment_end_time , 0 )
1332-
13331344 # Adjust time and make sure no less than 25ms
1334- finalize_delay = max (clamped_delay - time_slip , 0.025 )
1345+ finalize_delay = max (clamped_delay - ( self . _last_ttfb / 1000 ) , 0.025 )
13351346
13361347 # Emit prediction
13371348 if self .listeners (AgentServerMessageType .END_OF_TURN_PREDICTION ):
@@ -1340,7 +1351,6 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
13401351 turn_id = self ._turn_id ,
13411352 metadata = TurnPredictionMetadata (
13421353 ttl = round (finalize_delay , 2 ),
1343- time_slip = round (time_slip , 2 ),
13441354 reasons = [_reason for _ , _reason in reasons ],
13451355 ),
13461356 ),
@@ -1459,12 +1469,14 @@ def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
14591469 ),
14601470 )
14611471
1472+ # Update current speaker
1473+ self ._current_speaker = latest_speaker
1474+
14621475 # No further processing if the speaking state is unchanged (valid partial presence matches the current state)
14631476 if has_valid_partial == current_is_speaking :
14641477 return
14651478
1466- # Update current speaker + speaking states
1467- self ._current_speaker = latest_speaker
1479+ # Update speaking state
14681480 self ._is_speaking = not current_is_speaking
14691481
14701482 # Event time
0 commit comments