Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 38 additions & 3 deletions src/agents/realtime/_util.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,54 @@
from __future__ import annotations

from collections.abc import Mapping
from typing import Any

try:
from openai.types.realtime.realtime_audio_formats import (
AudioPCMA,
AudioPCMU,
)
except ImportError: # pragma: no cover - openai package missing the type
AudioPCMU = None # type: ignore[assignment,misc]
AudioPCMA = None # type: ignore[assignment,misc]

from .config import RealtimeAudioFormat

# Sample rate, in Hz, assumed for PCM16 audio when computing durations.
PCM16_SAMPLE_RATE_HZ = 24_000
# Bytes per PCM16 sample; a byte count is divided by this to get a sample count.
PCM16_SAMPLE_WIDTH_BYTES = 2
# Sample rate, in Hz, for G.711 audio; the byte count is divided by it directly.
G711_SAMPLE_RATE_HZ = 8_000


def _is_g711_format(format: RealtimeAudioFormat | None) -> bool:
    """Tell whether `format` describes a G.711 audio stream, whatever its shape.

    Recognised shapes are an `AudioPCMU` / `AudioPCMA` instance, a plain
    string, a `Mapping` carrying a ``"type"`` entry, or any other object
    exposing a ``type`` attribute. A string value counts as G.711 when,
    compared case-insensitively, it starts with ``"g711"`` or equals
    ``"audio/pcmu"`` / ``"audio/pcma"``.

    Returns:
        True for a G.711 format; False for ``None`` and for any value whose
        format identifier is missing or is not a string.
    """
    if format is None:
        return False

    # The typed models are matched by class before any string inspection:
    # their generated `type` field is Optional and defaults to None, so a bare
    # `AudioPCMU()` / `AudioPCMA()` carries no string to look at. Either name
    # is None when the installed openai package does not provide the type.
    for model_cls in (AudioPCMU, AudioPCMA):
        if model_cls is not None and isinstance(format, model_cls):
            return True

    # Reduce every remaining shape to one candidate identifier.
    candidate: Any
    if isinstance(format, str):
        candidate = format
    elif isinstance(format, Mapping):
        candidate = format.get("type")
    else:
        candidate = getattr(format, "type", None)

    if not isinstance(candidate, str):
        return False

    lowered = candidate.lower()
    return lowered.startswith("g711") or lowered in ("audio/pcmu", "audio/pcma")


def calculate_audio_length_ms(format: RealtimeAudioFormat | None, audio_bytes: bytes) -> float:
if not audio_bytes:
return 0.0

normalized_format = format.lower() if isinstance(format, str) else None

if normalized_format and normalized_format.startswith("g711"):
if _is_g711_format(format):
return (len(audio_bytes) / G711_SAMPLE_RATE_HZ) * 1000

samples = len(audio_bytes) / PCM16_SAMPLE_WIDTH_BYTES
Expand Down
60 changes: 60 additions & 0 deletions tests/realtime/test_playback_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,63 @@ def test_audio_length_calculation_with_different_formats(self):
# Test None format (defaults to PCM)
none_length = calculate_audio_length_ms(None, pcm_bytes)
assert none_length == pytest.approx(expected_pcm, rel=0, abs=1e-6)

def test_audio_length_calculation_handles_typed_and_mapping_g711_formats(self):
    """Every supported spelling of a g711 format is measured at the g711 rate.

    ``RealtimePlaybackTracker.set_audio_format`` and
    ``ModelAudioTracker.set_audio_format`` accept ``RealtimeAudioFormat``, which
    is ``str | Mapping | AudioPCM/PCMU/PCMA``. The length calculator used to
    special-case only strings starting with ``g711``; typed or Mapping g711
    formats and the ``audio/pcmu`` / ``audio/pcma`` strings silently fell back
    to PCM-24kHz math, giving a ~6x wrong duration and miscalculated truncation
    offsets on interrupt for SIP/Twilio sessions.
    """
    from openai.types.realtime.realtime_audio_formats import (
        AudioPCM,
        AudioPCMA,
        AudioPCMU,
    )

    from agents.realtime._util import calculate_audio_length_ms

    payload = b"x" * 80  # 80 bytes at the 8kHz g711 rate is 10ms
    g711_ms = (len(payload) / 8_000) * 1000
    pcm_ms = (len(payload) / (24_000 * 2)) * 1000

    def measured(audio_format):
        return calculate_audio_length_ms(audio_format, payload)

    # Typed pydantic g711 models must resolve to the g711 sample rate.
    for typed_g711 in (AudioPCMU(type="audio/pcmu"), AudioPCMA(type="audio/pcma")):
        assert measured(typed_g711) == pytest.approx(g711_ms, rel=0, abs=1e-6)

    # A typed PCM model must remain on the PCM path.
    typed_pcm = AudioPCM(type="audio/pcm", rate=24000)
    assert measured(typed_pcm) == pytest.approx(pcm_ms, rel=0, abs=1e-6)

    remaining_g711 = (
        # Mapping forms (as accepted by RealtimeAudioFormat).
        {"type": "audio/pcmu"},
        {"type": "audio/pcma"},
        # API-style ``audio/pcm*`` strings.
        "audio/pcmu",
        "audio/pcma",
        # AudioPCMU/AudioPCMA have an Optional `type` field that defaults to
        # None; the typed-model match must still classify them as G.711.
        AudioPCMU(),
        AudioPCMA(),
    )
    for g711_format in remaining_g711:
        assert measured(g711_format) == pytest.approx(g711_ms, rel=0, abs=1e-6)