diff --git a/src/agents/realtime/_util.py b/src/agents/realtime/_util.py
index 4de38f06fc..358202a9de 100644
--- a/src/agents/realtime/_util.py
+++ b/src/agents/realtime/_util.py
@@ -1,5 +1,17 @@
 from __future__ import annotations
 
+from collections.abc import Mapping
+from typing import Any
+
+try:
+    from openai.types.realtime.realtime_audio_formats import (
+        AudioPCMA,
+        AudioPCMU,
+    )
+except ImportError:  # pragma: no cover - openai package missing the type
+    AudioPCMU = None  # type: ignore[assignment,misc]
+    AudioPCMA = None  # type: ignore[assignment,misc]
+
 from .config import RealtimeAudioFormat
 
 PCM16_SAMPLE_RATE_HZ = 24_000
@@ -7,13 +19,36 @@
 G711_SAMPLE_RATE_HZ = 8_000
 
 
+def _is_g711_format(format: RealtimeAudioFormat | None) -> bool:
+    """Return True if `format` identifies G.711 audio, whichever representation it uses."""
+    if format is None:
+        return False
+    # Match the typed models first: their generated `type` field is Optional and
+    # defaults to None, so an `AudioPCMU()` / `AudioPCMA()` instance has nothing
+    # for the string-based check below to inspect.
+    if AudioPCMU is not None and isinstance(format, AudioPCMU):
+        return True
+    if AudioPCMA is not None and isinstance(format, AudioPCMA):
+        return True
+    if isinstance(format, str):  # bare string: the value itself is the format name
+        text = format.lower()
+        return text.startswith("g711") or text in ("audio/pcmu", "audio/pcma")
+    type_value: Any  # "type" key of a Mapping, else the object's `type` attribute
+    if isinstance(format, Mapping):
+        type_value = format.get("type")
+    else:
+        type_value = getattr(format, "type", None)
+    if not isinstance(type_value, str):  # missing or non-string type: not G.711
+        return False
+    text = type_value.lower()
+    return text.startswith("g711") or text in ("audio/pcmu", "audio/pcma")
+
+
 def calculate_audio_length_ms(format: RealtimeAudioFormat | None, audio_bytes: bytes) -> float:
     if not audio_bytes:
         return 0.0
 
-    normalized_format = format.lower() if isinstance(format, str) else None
-
-    if normalized_format and normalized_format.startswith("g711"):
+    if _is_g711_format(format):
         return (len(audio_bytes) / G711_SAMPLE_RATE_HZ) * 1000
 
     samples = len(audio_bytes) / PCM16_SAMPLE_WIDTH_BYTES
diff --git a/tests/realtime/test_playback_tracker.py b/tests/realtime/test_playback_tracker.py
index a0a284b17a..3274c0e236 100644
--- a/tests/realtime/test_playback_tracker.py
+++ b/tests/realtime/test_playback_tracker.py
@@ -178,3 +178,63 @@ def test_audio_length_calculation_with_different_formats(self):
         # Test None format (defaults to PCM)
         none_length = calculate_audio_length_ms(None, pcm_bytes)
         assert none_length == pytest.approx(expected_pcm, rel=0, abs=1e-6)
+
+    def test_audio_length_calculation_handles_typed_and_mapping_g711_formats(self):
+        """g711 audio passed as a typed pydantic model, Mapping, or ``audio/pcm*`` string
+        must be measured at the g711 sample rate.
+
+        ``RealtimePlaybackTracker.set_audio_format`` and ``ModelAudioTracker.set_audio_format``
+        accept ``RealtimeAudioFormat``, which is ``str | Mapping | AudioPCM/PCMU/PCMA``.
+        Previously the length calculator only special-cased strings starting with
+        ``g711``, so typed/Mapping g711 formats and the ``audio/pcmu``/``audio/pcma``
+        strings silently fell back to PCM-24kHz math, yielding a ~6x wrong duration
+        and miscalculating truncation offsets on interrupt for SIP/Twilio sessions.
+        """
+        from openai.types.realtime.realtime_audio_formats import (
+            AudioPCM,
+            AudioPCMA,
+            AudioPCMU,
+        )
+
+        from agents.realtime._util import calculate_audio_length_ms
+
+        audio_bytes = b"x" * 80  # at g711 8kHz: 10ms
+        expected_g711 = (len(audio_bytes) / 8_000) * 1000
+        expected_pcm = (len(audio_bytes) / (24_000 * 2)) * 1000
+
+        # Typed pydantic models for g711 should resolve to g711 sample rate.
+        assert calculate_audio_length_ms(
+            AudioPCMU(type="audio/pcmu"), audio_bytes
+        ) == pytest.approx(expected_g711, rel=0, abs=1e-6)
+        assert calculate_audio_length_ms(
+            AudioPCMA(type="audio/pcma"), audio_bytes
+        ) == pytest.approx(expected_g711, rel=0, abs=1e-6)
+        # The typed PCM model must stay on the PCM path (24 kHz, 2 bytes per sample).
+        assert calculate_audio_length_ms(
+            AudioPCM(type="audio/pcm", rate=24000), audio_bytes
+        ) == pytest.approx(expected_pcm, rel=0, abs=1e-6)
+
+        # Mapping forms (as accepted by RealtimeAudioFormat).
+        assert calculate_audio_length_ms({"type": "audio/pcmu"}, audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+        assert calculate_audio_length_ms({"type": "audio/pcma"}, audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+
+        # API-style ``audio/pcm*`` strings should also be honored.
+        assert calculate_audio_length_ms("audio/pcmu", audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+        assert calculate_audio_length_ms("audio/pcma", audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+
+        # AudioPCMU/AudioPCMA have an Optional `type` field that defaults to None.
+        # The typed-model match must classify them as G.711 even without `type` set.
+        assert calculate_audio_length_ms(AudioPCMU(), audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+        assert calculate_audio_length_ms(AudioPCMA(), audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )