openai · adityasingh2400 · May 8, 2026 · May 11, 2026
diff --git a/src/agents/realtime/_util.py b/src/agents/realtime/_util.py
@@ -1,19 +1,54 @@
 from __future__ import annotations
 
+from collections.abc import Mapping
+from typing import Any
+
+try:
+    from openai.types.realtime.realtime_audio_formats import (
+        AudioPCMA,
+        AudioPCMU,
+    )
+except ImportError:  # pragma: no cover - openai package missing the type
+    AudioPCMU = None  # type: ignore[assignment,misc]
+    AudioPCMA = None  # type: ignore[assignment,misc]
+
 from .config import RealtimeAudioFormat
 
 PCM16_SAMPLE_RATE_HZ = 24_000
 PCM16_SAMPLE_WIDTH_BYTES = 2
 G711_SAMPLE_RATE_HZ = 8_000
 
 
+def _is_g711_format(format: RealtimeAudioFormat | None) -> bool:
+    """Return True if `format` (string, mapping, or typed model) denotes G.711 audio."""
+    if format is None:
+        return False
+    # Match the typed models first: their generated `type` field is Optional and
+    # defaults to None, so an `AudioPCMU()` / `AudioPCMA()` instance has nothing
+    # for the string-based check below to inspect.
+    if AudioPCMU is not None and isinstance(format, AudioPCMU):
+        return True
+    if AudioPCMA is not None and isinstance(format, AudioPCMA):
+        return True
+    if isinstance(format, str):
+        text = format.lower()
+        return text.startswith("g711") or text in ("audio/pcmu", "audio/pcma")
+    type_value: Any
+    if isinstance(format, Mapping):
+        type_value = format.get("type")
+    else:
+        type_value = getattr(format, "type", None)
+    if not isinstance(type_value, str):
+        return False
+    text = type_value.lower()
+    return text.startswith("g711") or text in ("audio/pcmu", "audio/pcma")
+
+
 def calculate_audio_length_ms(format: RealtimeAudioFormat | None, audio_bytes: bytes) -> float:
     if not audio_bytes:
         return 0.0
 
-    normalized_format = format.lower() if isinstance(format, str) else None
-
-    if normalized_format and normalized_format.startswith("g711"):
+    if _is_g711_format(format):
         return (len(audio_bytes) / G711_SAMPLE_RATE_HZ) * 1000
 
     samples = len(audio_bytes) / PCM16_SAMPLE_WIDTH_BYTES

diff --git a/tests/realtime/test_playback_tracker.py b/tests/realtime/test_playback_tracker.py
@@ -178,3 +178,63 @@ def test_audio_length_calculation_with_different_formats(self):
         # Test None format (defaults to PCM)
         none_length = calculate_audio_length_ms(None, pcm_bytes)
         assert none_length == pytest.approx(expected_pcm, rel=0, abs=1e-6)
+
+    def test_audio_length_calculation_handles_typed_and_mapping_g711_formats(self):
+        """g711 audio passed as a typed pydantic model, Mapping, or ``audio/pcm*`` string
+        must be measured at the g711 sample rate.
+
+        ``RealtimePlaybackTracker.set_audio_format`` and ``ModelAudioTracker.set_audio_format``
+        accept ``RealtimeAudioFormat``, which is ``str | Mapping | AudioPCM/PCMU/PCMA``.
+        Previously the length calculator only special-cased strings starting with
+        ``g711``, so typed/Mapping g711 formats and the ``audio/pcmu``/``audio/pcma``
+        strings silently fell back to PCM-24kHz math, yielding a duration ~6x too
+        short and miscalculating truncation offsets on interrupt for SIP/Twilio sessions.
+        """
+        from openai.types.realtime.realtime_audio_formats import (
+            AudioPCM,
+            AudioPCMA,
+            AudioPCMU,
+        )
+
+        from agents.realtime._util import calculate_audio_length_ms
+
+        audio_bytes = b"x" * 80  # at g711 8kHz: 10ms
+        expected_g711 = (len(audio_bytes) / 8_000) * 1000
+        expected_pcm = (len(audio_bytes) / (24_000 * 2)) * 1000
+
+        # Typed pydantic models for g711 should resolve to g711 sample rate.
+        assert calculate_audio_length_ms(
+            AudioPCMU(type="audio/pcmu"), audio_bytes
+        ) == pytest.approx(expected_g711, rel=0, abs=1e-6)
+        assert calculate_audio_length_ms(
+            AudioPCMA(type="audio/pcma"), audio_bytes
+        ) == pytest.approx(expected_g711, rel=0, abs=1e-6)
+        # A typed PCM model must stay on the PCM path.
+        assert calculate_audio_length_ms(
+            AudioPCM(type="audio/pcm", rate=24000), audio_bytes
+        ) == pytest.approx(expected_pcm, rel=0, abs=1e-6)
+
+        # Mapping forms (as accepted by RealtimeAudioFormat).
+        assert calculate_audio_length_ms({"type": "audio/pcmu"}, audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+        assert calculate_audio_length_ms({"type": "audio/pcma"}, audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+
+        # API-style ``audio/pcmu`` / ``audio/pcma`` strings should also be honored.
+        assert calculate_audio_length_ms("audio/pcmu", audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+        assert calculate_audio_length_ms("audio/pcma", audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+
+        # AudioPCMU/AudioPCMA have an Optional `type` field that defaults to None.
+        # The typed-model match must classify them as G.711 even without `type` set.
+        assert calculate_audio_length_ms(AudioPCMU(), audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )
+        assert calculate_audio_length_ms(AudioPCMA(), audio_bytes) == pytest.approx(
+            expected_g711, rel=0, abs=1e-6
+        )